From b333b06772c89d96aacb5490d6a219fba7c09cc6 Mon Sep 17 00:00:00 2001
From: Mitja Felicijan
Date: Thu, 12 Feb 2026 20:57:17 +0100
Subject: Engage!

---
 llama.cpp/tools/CMakeLists.txt | 40 +
 llama.cpp/tools/batched-bench/CMakeLists.txt | 8 +
 llama.cpp/tools/batched-bench/README.md | 60 +
 llama.cpp/tools/batched-bench/batched-bench.cpp | 256 +
 llama.cpp/tools/cli/CMakeLists.txt | 10 +
 llama.cpp/tools/cli/README.md | 192 +
 llama.cpp/tools/cli/cli.cpp | 421 +
 llama.cpp/tools/completion/CMakeLists.txt | 8 +
 llama.cpp/tools/completion/README.md | 578 ++
 llama.cpp/tools/completion/completion.cpp | 1001 +++
 llama.cpp/tools/cvector-generator/CMakeLists.txt | 8 +
 llama.cpp/tools/cvector-generator/README.md | 45 +
 llama.cpp/tools/cvector-generator/completions.txt | 582 ++
 .../tools/cvector-generator/cvector-generator.cpp | 508 ++
 llama.cpp/tools/cvector-generator/mean.hpp | 48 +
 llama.cpp/tools/cvector-generator/negative.txt | 4 +
 llama.cpp/tools/cvector-generator/pca.hpp | 315 +
 llama.cpp/tools/cvector-generator/positive.txt | 4 +
 llama.cpp/tools/export-lora/CMakeLists.txt | 8 +
 llama.cpp/tools/export-lora/README.md | 33 +
 llama.cpp/tools/export-lora/export-lora.cpp | 434 +
 llama.cpp/tools/fit-params/CMakeLists.txt | 8 +
 llama.cpp/tools/fit-params/README.md | 55 +
 llama.cpp/tools/fit-params/fit-params.cpp | 66 +
 llama.cpp/tools/gguf-split/CMakeLists.txt | 8 +
 llama.cpp/tools/gguf-split/README.md | 10 +
 llama.cpp/tools/gguf-split/gguf-split.cpp | 583 ++
 llama.cpp/tools/gguf-split/tests.sh | 89 +
 llama.cpp/tools/imatrix/CMakeLists.txt | 13 +
 llama.cpp/tools/imatrix/README.md | 98 +
 llama.cpp/tools/imatrix/imatrix.cpp | 1302 +++
 llama.cpp/tools/llama-bench/CMakeLists.txt | 8 +
 llama.cpp/tools/llama-bench/README.md | 349 +
 llama.cpp/tools/llama-bench/llama-bench.cpp | 2291 +++++
 llama.cpp/tools/mtmd/CMakeLists.txt | 96 +
 llama.cpp/tools/mtmd/README.md | 63 +
 llama.cpp/tools/mtmd/clip-graph.h | 117 +
 llama.cpp/tools/mtmd/clip-impl.h | 582 ++
 llama.cpp/tools/mtmd/clip-model.h | 389 +
 llama.cpp/tools/mtmd/clip.cpp | 4080 +++++++++
 llama.cpp/tools/mtmd/clip.h | 121 +
 llama.cpp/tools/mtmd/deprecation-warning.cpp | 22 +
 .../legacy-models/convert_image_encoder_to_gguf.py | 412 +
 .../glmedge-convert-image-encoder-to-gguf.py | 280 +
 .../tools/mtmd/legacy-models/glmedge-surgery.py | 33 +
 .../tools/mtmd/legacy-models/llava_surgery.py | 38 +
 .../tools/mtmd/legacy-models/llava_surgery_v2.py | 180 +
 .../minicpmv-convert-image-encoder-to-gguf.py | 892 ++
 .../tools/mtmd/legacy-models/minicpmv-surgery.py | 47 +
 llama.cpp/tools/mtmd/models/cogvlm.cpp | 98 +
 llama.cpp/tools/mtmd/models/conformer.cpp | 216 +
 llama.cpp/tools/mtmd/models/glm4v.cpp | 120 +
 llama.cpp/tools/mtmd/models/internvl.cpp | 69 +
 llama.cpp/tools/mtmd/models/kimik25.cpp | 101 +
 llama.cpp/tools/mtmd/models/kimivl.cpp | 63 +
 llama.cpp/tools/mtmd/models/llama4.cpp | 96 +
 llama.cpp/tools/mtmd/models/llava.cpp | 374 +
 llama.cpp/tools/mtmd/models/minicpmv.cpp | 114 +
 llama.cpp/tools/mtmd/models/mobilenetv5.cpp | 451 +
 llama.cpp/tools/mtmd/models/models.h | 118 +
 llama.cpp/tools/mtmd/models/pixtral.cpp | 86 +
 llama.cpp/tools/mtmd/models/qwen2vl.cpp | 183 +
 llama.cpp/tools/mtmd/models/qwen3vl.cpp | 193 +
 llama.cpp/tools/mtmd/models/siglip.cpp | 86 +
 llama.cpp/tools/mtmd/models/whisper-enc.cpp | 115 +
 llama.cpp/tools/mtmd/models/youtuvl.cpp | 179 +
 llama.cpp/tools/mtmd/mtmd-audio.cpp | 730 ++
 llama.cpp/tools/mtmd/mtmd-audio.h | 113 +
 llama.cpp/tools/mtmd/mtmd-cli.cpp | 437 +
 llama.cpp/tools/mtmd/mtmd-helper.cpp | 521 ++
 llama.cpp/tools/mtmd/mtmd-helper.h | 96 +
llama.cpp/tools/mtmd/mtmd.cpp | 1151 +++ llama.cpp/tools/mtmd/mtmd.h | 319 + llama.cpp/tools/mtmd/requirements.txt | 5 + llama.cpp/tools/mtmd/test-1.jpeg | Bin 0 -> 124071 bytes llama.cpp/tools/mtmd/test-2.mp3 | Bin 0 -> 140060 bytes llama.cpp/tools/mtmd/tests.sh | 183 + llama.cpp/tools/perplexity/CMakeLists.txt | 8 + llama.cpp/tools/perplexity/README.md | 193 + llama.cpp/tools/perplexity/perplexity.cpp | 2070 +++++ llama.cpp/tools/quantize/CMakeLists.txt | 9 + llama.cpp/tools/quantize/README.md | 171 + llama.cpp/tools/quantize/quantize.cpp | 733 ++ llama.cpp/tools/quantize/tests.sh | 65 + llama.cpp/tools/rpc/CMakeLists.txt | 8 + llama.cpp/tools/rpc/README.md | 104 + llama.cpp/tools/rpc/rpc-server.cpp | 336 + llama.cpp/tools/server/CMakeLists.txt | 70 + llama.cpp/tools/server/README-dev.md | 179 + llama.cpp/tools/server/README.md | 1782 ++++ llama.cpp/tools/server/bench/README.md | 119 + llama.cpp/tools/server/bench/bench.py | 322 + llama.cpp/tools/server/bench/prometheus.yml | 9 + llama.cpp/tools/server/bench/requirements.txt | 2 + llama.cpp/tools/server/bench/script.js | 162 + llama.cpp/tools/server/chat-llama2.sh | 109 + llama.cpp/tools/server/chat.mjs | 131 + llama.cpp/tools/server/chat.sh | 80 + llama.cpp/tools/server/public/index.html.gz | Bin 0 -> 1453103 bytes llama.cpp/tools/server/public/loading.html | 12 + .../tools/server/public_legacy/colorthemes.css | 402 + llama.cpp/tools/server/public_legacy/completion.js | 209 + llama.cpp/tools/server/public_legacy/favicon.ico | Bin 0 -> 4122 bytes .../tools/server/public_legacy/index-new.html | 1190 +++ llama.cpp/tools/server/public_legacy/index.html | 1301 +++ llama.cpp/tools/server/public_legacy/index.js | 1 + .../public_legacy/json-schema-to-grammar.mjs | 856 ++ llama.cpp/tools/server/public_legacy/loading.html | 12 + .../tools/server/public_legacy/prompt-formats.js | 331 + llama.cpp/tools/server/public_legacy/style.css | 954 ++ .../tools/server/public_legacy/system-prompts.js | 68 + .../server/public_legacy/theme-beeninorder.css | 228 + .../tools/server/public_legacy/theme-ketivah.css | 201 + .../server/public_legacy/theme-mangotango.css | 216 + .../server/public_legacy/theme-playground.css | 221 + .../server/public_legacy/theme-polarnight.css | 253 + .../tools/server/public_legacy/theme-snowstorm.css | 251 + .../tools/server/public_simplechat/datautils.mjs | 266 + .../tools/server/public_simplechat/index.html | 51 + llama.cpp/tools/server/public_simplechat/readme.md | 286 + .../tools/server/public_simplechat/simplechat.css | 79 + .../tools/server/public_simplechat/simplechat.js | 929 ++ .../public_simplechat/simplechat_screens.webp | Bin 0 -> 21376 bytes llama.cpp/tools/server/public_simplechat/ui.mjs | 211 + llama.cpp/tools/server/server-common.cpp | 1980 +++++ llama.cpp/tools/server/server-common.h | 366 + llama.cpp/tools/server/server-context.cpp | 4105 +++++++++ llama.cpp/tools/server/server-context.h | 131 + llama.cpp/tools/server/server-http.cpp | 406 + llama.cpp/tools/server/server-http.h | 78 + llama.cpp/tools/server/server-models.cpp | 1092 +++ llama.cpp/tools/server/server-models.h | 203 + llama.cpp/tools/server/server-queue.cpp | 450 + llama.cpp/tools/server/server-queue.h | 197 + llama.cpp/tools/server/server-task.cpp | 2005 +++++ llama.cpp/tools/server/server-task.h | 620 ++ llama.cpp/tools/server/server.cpp | 322 + llama.cpp/tools/server/tests/.gitignore | 2 + llama.cpp/tools/server/tests/README.md | 96 + llama.cpp/tools/server/tests/conftest.py | 21 + llama.cpp/tools/server/tests/pytest.ini | 4 + 
llama.cpp/tools/server/tests/requirements.txt | 8 + llama.cpp/tools/server/tests/tests.sh | 23 + llama.cpp/tools/server/tests/unit/test_basic.py | 96 + .../server/tests/unit/test_chat_completion.py | 512 ++ .../server/tests/unit/test_compat_anthropic.py | 896 ++ .../server/tests/unit/test_compat_oai_responses.py | 73 + .../tools/server/tests/unit/test_completion.py | 608 ++ .../tools/server/tests/unit/test_ctx_shift.py | 89 + .../tools/server/tests/unit/test_embedding.py | 257 + llama.cpp/tools/server/tests/unit/test_infill.py | 77 + llama.cpp/tools/server/tests/unit/test_lora.py | 115 + llama.cpp/tools/server/tests/unit/test_rerank.py | 146 + llama.cpp/tools/server/tests/unit/test_router.py | 194 + llama.cpp/tools/server/tests/unit/test_security.py | 127 + llama.cpp/tools/server/tests/unit/test_sleep.py | 39 + .../tools/server/tests/unit/test_slot_save.py | 98 + .../tools/server/tests/unit/test_speculative.py | 131 + llama.cpp/tools/server/tests/unit/test_template.py | 105 + llama.cpp/tools/server/tests/unit/test_tokenize.py | 59 + .../tools/server/tests/unit/test_tool_call.py | 625 ++ .../tools/server/tests/unit/test_vision_api.py | 160 + llama.cpp/tools/server/tests/utils.py | 643 ++ llama.cpp/tools/server/themes/README.md | 5 + .../tools/server/themes/buttons-top/README.md | 7 + .../server/themes/buttons-top/buttons_top.png | Bin 0 -> 119747 bytes .../tools/server/themes/buttons-top/favicon.ico | Bin 0 -> 4122 bytes .../tools/server/themes/buttons-top/index.html | 1052 +++ llama.cpp/tools/server/themes/wild/README.md | 5 + llama.cpp/tools/server/themes/wild/favicon.ico | Bin 0 -> 4122 bytes llama.cpp/tools/server/themes/wild/index.html | 1056 +++ llama.cpp/tools/server/themes/wild/llama_cpp.png | Bin 0 -> 76484 bytes .../tools/server/themes/wild/llamapattern.png | Bin 0 -> 259586 bytes llama.cpp/tools/server/themes/wild/wild.png | Bin 0 -> 496463 bytes llama.cpp/tools/server/webui/.gitignore | 28 + llama.cpp/tools/server/webui/.npmrc | 1 + llama.cpp/tools/server/webui/.prettierignore | 9 + llama.cpp/tools/server/webui/.prettierrc | 16 + .../webui/.storybook/ModeWatcherDecorator.svelte | 36 + .../.storybook/TooltipProviderDecorator.svelte | 13 + llama.cpp/tools/server/webui/.storybook/main.ts | 17 + llama.cpp/tools/server/webui/.storybook/preview.ts | 42 + .../tools/server/webui/.storybook/vitest.setup.ts | 12 + llama.cpp/tools/server/webui/README.md | 687 ++ llama.cpp/tools/server/webui/components.json | 16 + .../high-level-architecture-simplified.md | 106 + .../docs/architecture/high-level-architecture.md | 279 + .../tools/server/webui/docs/flows/chat-flow.md | 174 + .../server/webui/docs/flows/conversations-flow.md | 155 + .../docs/flows/data-flow-simplified-model-mode.md | 45 + .../docs/flows/data-flow-simplified-router-mode.md | 77 + .../tools/server/webui/docs/flows/database-flow.md | 155 + .../tools/server/webui/docs/flows/models-flow.md | 181 + .../tools/server/webui/docs/flows/server-flow.md | 76 + .../tools/server/webui/docs/flows/settings-flow.md | 144 + llama.cpp/tools/server/webui/eslint.config.js | 49 + llama.cpp/tools/server/webui/package-lock.json | 9343 ++++++++++++++++++++ llama.cpp/tools/server/webui/package.json | 94 + llama.cpp/tools/server/webui/playwright.config.ts | 11 + llama.cpp/tools/server/webui/scripts/dev.sh | 57 + .../server/webui/scripts/install-git-hooks.sh | 202 + llama.cpp/tools/server/webui/scripts/post-build.sh | 3 + llama.cpp/tools/server/webui/src/app.css | 138 + llama.cpp/tools/server/webui/src/app.d.ts | 133 + 
llama.cpp/tools/server/webui/src/app.html | 12 + .../ChatAttachments/ChatAttachmentPreview.svelte | 283 + .../ChatAttachmentThumbnailFile.svelte | 165 + .../ChatAttachmentThumbnailImage.svelte | 64 + .../ChatAttachments/ChatAttachmentsList.svelte | 243 + .../ChatAttachments/ChatAttachmentsViewAll.svelte | 117 + .../components/app/chat/ChatForm/ChatForm.svelte | 315 + .../ChatFormActionFileAttachments.svelte | 123 + .../ChatFormActions/ChatFormActionRecord.svelte | 52 + .../ChatFormActions/ChatFormActionSubmit.svelte | 55 + .../ChatFormActions/ChatFormActions.svelte | 204 + .../ChatForm/ChatFormFileInputInvisible.svelte | 30 + .../app/chat/ChatForm/ChatFormHelperText.svelte | 17 + .../app/chat/ChatForm/ChatFormTextarea.svelte | 59 + .../app/chat/ChatMessages/ChatMessage.svelte | 286 + .../chat/ChatMessages/ChatMessageActions.svelte | 100 + .../chat/ChatMessages/ChatMessageAssistant.svelte | 418 + .../ChatMessageBranchingControls.svelte | 84 + .../chat/ChatMessages/ChatMessageEditForm.svelte | 391 + .../chat/ChatMessages/ChatMessageStatistics.svelte | 175 + .../app/chat/ChatMessages/ChatMessageSystem.svelte | 216 + .../ChatMessages/ChatMessageThinkingBlock.svelte | 68 + .../app/chat/ChatMessages/ChatMessageUser.svelte | 163 + .../app/chat/ChatMessages/ChatMessages.svelte | 143 + .../app/chat/ChatScreen/ChatScreen.svelte | 617 ++ .../chat/ChatScreen/ChatScreenDragOverlay.svelte | 17 + .../app/chat/ChatScreen/ChatScreenHeader.svelte | 28 + .../ChatScreen/ChatScreenProcessingInfo.svelte | 120 + .../app/chat/ChatSettings/ChatSettings.svelte | 508 ++ .../chat/ChatSettings/ChatSettingsFields.svelte | 255 + .../chat/ChatSettings/ChatSettingsFooter.svelte | 59 + .../ChatSettingsImportExportTab.svelte | 317 + .../ChatSettingsParameterSourceIndicator.svelte | 18 + .../app/chat/ChatSidebar/ChatSidebar.svelte | 211 + .../app/chat/ChatSidebar/ChatSidebarActions.svelte | 81 + .../ChatSidebar/ChatSidebarConversationItem.svelte | 200 + .../app/chat/ChatSidebar/ChatSidebarSearch.svelte | 19 + .../handle-mobile-sidebar-item-click.ts | 9 + .../app/dialogs/DialogChatAttachmentPreview.svelte | 67 + .../dialogs/DialogChatAttachmentsViewAll.svelte | 54 + .../components/app/dialogs/DialogChatError.svelte | 70 + .../app/dialogs/DialogChatSettings.svelte | 37 + .../app/dialogs/DialogConfirmation.svelte | 72 + .../app/dialogs/DialogConversationSelection.svelte | 68 + .../dialogs/DialogConversationTitleUpdate.svelte | 46 + .../app/dialogs/DialogEmptyFileAlert.svelte | 61 + .../app/dialogs/DialogModelInformation.svelte | 211 + .../app/dialogs/DialogModelNotAvailable.svelte | 76 + .../server/webui/src/lib/components/app/index.ts | 75 + .../lib/components/app/misc/ActionButton.svelte | 47 + .../lib/components/app/misc/ActionDropdown.svelte | 86 + .../components/app/misc/BadgeChatStatistic.svelte | 44 + .../src/lib/components/app/misc/BadgeInfo.svelte | 27 + .../lib/components/app/misc/BadgeModality.svelte | 39 + .../components/app/misc/CodePreviewDialog.svelte | 93 + .../app/misc/ConversationSelection.svelte | 205 + .../components/app/misc/CopyToClipboardIcon.svelte | 18 + .../app/misc/KeyboardShortcutInfo.svelte | 31 + .../lib/components/app/misc/MarkdownContent.svelte | 870 ++ .../lib/components/app/misc/RemoveButton.svelte | 26 + .../src/lib/components/app/misc/SearchInput.svelte | 73 + .../app/misc/SyntaxHighlightedCode.svelte | 97 + .../lib/components/app/models/ModelBadge.svelte | 56 + .../components/app/models/ModelsSelector.svelte | 555 ++ .../components/app/server/ServerErrorSplash.svelte | 282 + 
.../app/server/ServerLoadingSplash.svelte | 33 + .../lib/components/app/server/ServerStatus.svelte | 65 + .../ui/alert-dialog/alert-dialog-action.svelte | 18 + .../ui/alert-dialog/alert-dialog-cancel.svelte | 18 + .../ui/alert-dialog/alert-dialog-content.svelte | 35 + .../alert-dialog/alert-dialog-description.svelte | 17 + .../ui/alert-dialog/alert-dialog-footer.svelte | 23 + .../ui/alert-dialog/alert-dialog-header.svelte | 20 + .../ui/alert-dialog/alert-dialog-overlay.svelte | 20 + .../ui/alert-dialog/alert-dialog-title.svelte | 17 + .../ui/alert-dialog/alert-dialog-trigger.svelte | 7 + .../src/lib/components/ui/alert-dialog/index.ts | 39 + .../components/ui/alert/alert-description.svelte | 23 + .../src/lib/components/ui/alert/alert-title.svelte | 20 + .../webui/src/lib/components/ui/alert/alert.svelte | 44 + .../webui/src/lib/components/ui/alert/index.ts | 14 + .../webui/src/lib/components/ui/badge/badge.svelte | 49 + .../webui/src/lib/components/ui/badge/index.ts | 2 + .../src/lib/components/ui/button/button.svelte | 87 + .../webui/src/lib/components/ui/button/index.ts | 17 + .../src/lib/components/ui/card/card-action.svelte | 20 + .../src/lib/components/ui/card/card-content.svelte | 15 + .../lib/components/ui/card/card-description.svelte | 20 + .../src/lib/components/ui/card/card-footer.svelte | 20 + .../src/lib/components/ui/card/card-header.svelte | 23 + .../src/lib/components/ui/card/card-title.svelte | 20 + .../webui/src/lib/components/ui/card/card.svelte | 23 + .../webui/src/lib/components/ui/card/index.ts | 25 + .../src/lib/components/ui/checkbox/checkbox.svelte | 36 + .../webui/src/lib/components/ui/checkbox/index.ts | 6 + .../ui/collapsible/collapsible-content.svelte | 7 + .../ui/collapsible/collapsible-trigger.svelte | 7 + .../components/ui/collapsible/collapsible.svelte | 11 + .../src/lib/components/ui/collapsible/index.ts | 13 + .../lib/components/ui/dialog/dialog-close.svelte | 7 + .../lib/components/ui/dialog/dialog-content.svelte | 43 + .../components/ui/dialog/dialog-description.svelte | 17 + .../lib/components/ui/dialog/dialog-footer.svelte | 20 + .../lib/components/ui/dialog/dialog-header.svelte | 20 + .../lib/components/ui/dialog/dialog-overlay.svelte | 20 + .../lib/components/ui/dialog/dialog-title.svelte | 17 + .../lib/components/ui/dialog/dialog-trigger.svelte | 7 + .../webui/src/lib/components/ui/dialog/index.ts | 37 + .../dropdown-menu-checkbox-item.svelte | 41 + .../ui/dropdown-menu/dropdown-menu-content.svelte | 27 + .../dropdown-menu-group-heading.svelte | 22 + .../ui/dropdown-menu/dropdown-menu-group.svelte | 7 + .../ui/dropdown-menu/dropdown-menu-item.svelte | 27 + .../ui/dropdown-menu/dropdown-menu-label.svelte | 24 + .../dropdown-menu/dropdown-menu-radio-group.svelte | 16 + .../dropdown-menu/dropdown-menu-radio-item.svelte | 31 + .../dropdown-menu/dropdown-menu-separator.svelte | 17 + .../ui/dropdown-menu/dropdown-menu-shortcut.svelte | 20 + .../dropdown-menu/dropdown-menu-sub-content.svelte | 20 + .../dropdown-menu/dropdown-menu-sub-trigger.svelte | 29 + .../ui/dropdown-menu/dropdown-menu-trigger.svelte | 7 + .../src/lib/components/ui/dropdown-menu/index.ts | 49 + .../webui/src/lib/components/ui/input/index.ts | 7 + .../webui/src/lib/components/ui/input/input.svelte | 51 + .../webui/src/lib/components/ui/label/index.ts | 7 + .../webui/src/lib/components/ui/label/label.svelte | 20 + .../webui/src/lib/components/ui/popover/index.ts | 19 + .../lib/components/ui/popover/popover-close.svelte | 7 + .../components/ui/popover/popover-content.svelte | 37 + 
.../components/ui/popover/popover-portal.svelte | 7 + .../components/ui/popover/popover-trigger.svelte | 17 + .../src/lib/components/ui/popover/popover.svelte | 7 + .../src/lib/components/ui/scroll-area/index.ts | 10 + .../ui/scroll-area/scroll-area-scrollbar.svelte | 31 + .../components/ui/scroll-area/scroll-area.svelte | 40 + .../webui/src/lib/components/ui/select/index.ts | 37 + .../lib/components/ui/select/select-content.svelte | 111 + .../ui/select/select-group-heading.svelte | 21 + .../lib/components/ui/select/select-group.svelte | 7 + .../lib/components/ui/select/select-item.svelte | 38 + .../lib/components/ui/select/select-label.svelte | 20 + .../ui/select/select-scroll-down-button.svelte | 20 + .../ui/select/select-scroll-up-button.svelte | 20 + .../components/ui/select/select-separator.svelte | 18 + .../lib/components/ui/select/select-trigger.svelte | 40 + .../webui/src/lib/components/ui/separator/index.ts | 7 + .../lib/components/ui/separator/separator.svelte | 20 + .../webui/src/lib/components/ui/sheet/index.ts | 36 + .../src/lib/components/ui/sheet/sheet-close.svelte | 7 + .../lib/components/ui/sheet/sheet-content.svelte | 60 + .../components/ui/sheet/sheet-description.svelte | 17 + .../lib/components/ui/sheet/sheet-footer.svelte | 20 + .../lib/components/ui/sheet/sheet-header.svelte | 20 + .../lib/components/ui/sheet/sheet-overlay.svelte | 20 + .../src/lib/components/ui/sheet/sheet-title.svelte | 17 + .../lib/components/ui/sheet/sheet-trigger.svelte | 7 + .../src/lib/components/ui/sidebar/constants.ts | 6 + .../lib/components/ui/sidebar/context.svelte.ts | 79 + .../webui/src/lib/components/ui/sidebar/index.ts | 75 + .../components/ui/sidebar/sidebar-content.svelte | 24 + .../components/ui/sidebar/sidebar-footer.svelte | 21 + .../ui/sidebar/sidebar-group-action.svelte | 36 + .../ui/sidebar/sidebar-group-content.svelte | 21 + .../ui/sidebar/sidebar-group-label.svelte | 34 + .../lib/components/ui/sidebar/sidebar-group.svelte | 21 + .../components/ui/sidebar/sidebar-header.svelte | 21 + .../lib/components/ui/sidebar/sidebar-input.svelte | 21 + .../lib/components/ui/sidebar/sidebar-inset.svelte | 24 + .../ui/sidebar/sidebar-menu-action.svelte | 43 + .../ui/sidebar/sidebar-menu-badge.svelte | 29 + .../ui/sidebar/sidebar-menu-button.svelte | 106 + .../components/ui/sidebar/sidebar-menu-item.svelte | 21 + .../ui/sidebar/sidebar-menu-skeleton.svelte | 36 + .../ui/sidebar/sidebar-menu-sub-button.svelte | 43 + .../ui/sidebar/sidebar-menu-sub-item.svelte | 21 + .../components/ui/sidebar/sidebar-menu-sub.svelte | 25 + .../lib/components/ui/sidebar/sidebar-menu.svelte | 21 + .../components/ui/sidebar/sidebar-provider.svelte | 50 + .../lib/components/ui/sidebar/sidebar-rail.svelte | 36 + .../components/ui/sidebar/sidebar-separator.svelte | 19 + .../components/ui/sidebar/sidebar-trigger.svelte | 35 + .../src/lib/components/ui/sidebar/sidebar.svelte | 101 + .../webui/src/lib/components/ui/skeleton/index.ts | 7 + .../src/lib/components/ui/skeleton/skeleton.svelte | 17 + .../webui/src/lib/components/ui/switch/index.ts | 7 + .../src/lib/components/ui/switch/switch.svelte | 29 + .../webui/src/lib/components/ui/table/index.ts | 28 + .../src/lib/components/ui/table/table-body.svelte | 20 + .../lib/components/ui/table/table-caption.svelte | 20 + .../src/lib/components/ui/table/table-cell.svelte | 23 + .../lib/components/ui/table/table-footer.svelte | 20 + .../src/lib/components/ui/table/table-head.svelte | 23 + .../lib/components/ui/table/table-header.svelte | 20 + 
.../src/lib/components/ui/table/table-row.svelte | 23 + .../webui/src/lib/components/ui/table/table.svelte | 22 + .../webui/src/lib/components/ui/textarea/index.ts | 7 + .../src/lib/components/ui/textarea/textarea.svelte | 22 + .../webui/src/lib/components/ui/tooltip/index.ts | 21 + .../components/ui/tooltip/tooltip-content.svelte | 47 + .../components/ui/tooltip/tooltip-trigger.svelte | 7 + .../server/webui/src/lib/components/ui/utils.ts | 13 + .../server/webui/src/lib/constants/auto-scroll.ts | 3 + .../webui/src/lib/constants/binary-detection.ts | 14 + .../webui/src/lib/constants/default-context.ts | 1 + .../src/lib/constants/floating-ui-constraints.ts | 2 + .../tools/server/webui/src/lib/constants/icons.ts | 32 + .../webui/src/lib/constants/input-classes.ts | 6 + .../webui/src/lib/constants/latex-protection.ts | 35 + .../server/webui/src/lib/constants/literal-html.ts | 15 + .../webui/src/lib/constants/localstorage-keys.ts | 2 + .../webui/src/lib/constants/max-bundle-size.ts | 1 + .../server/webui/src/lib/constants/precision.ts | 2 + .../webui/src/lib/constants/processing-info.ts | 1 + .../webui/src/lib/constants/settings-config.ts | 117 + .../src/lib/constants/supported-file-types.ts | 217 + .../webui/src/lib/constants/table-html-restorer.ts | 20 + .../webui/src/lib/constants/tooltip-config.ts | 1 + .../server/webui/src/lib/constants/viewport.ts | 1 + .../tools/server/webui/src/lib/enums/attachment.ts | 10 + llama.cpp/tools/server/webui/src/lib/enums/chat.ts | 4 + .../tools/server/webui/src/lib/enums/files.ts | 206 + .../tools/server/webui/src/lib/enums/index.ts | 23 + .../tools/server/webui/src/lib/enums/model.ts | 5 + .../tools/server/webui/src/lib/enums/server.ts | 20 + .../server/webui/src/lib/hooks/is-mobile.svelte.ts | 8 + .../hooks/use-model-change-validation.svelte.ts | 118 + .../src/lib/hooks/use-processing-state.svelte.ts | 262 + .../webui/src/lib/markdown/enhance-code-blocks.ts | 162 + .../server/webui/src/lib/markdown/enhance-links.ts | 33 + .../server/webui/src/lib/markdown/literal-html.ts | 121 + .../webui/src/lib/markdown/table-html-restorer.ts | 181 + .../tools/server/webui/src/lib/services/chat.ts | 784 ++ .../server/webui/src/lib/services/database.ts | 400 + .../tools/server/webui/src/lib/services/index.ts | 5 + .../tools/server/webui/src/lib/services/models.ts | 124 + .../webui/src/lib/services/parameter-sync.spec.ts | 148 + .../webui/src/lib/services/parameter-sync.ts | 279 + .../tools/server/webui/src/lib/services/props.ts | 77 + .../server/webui/src/lib/stores/chat.svelte.ts | 1487 ++++ .../webui/src/lib/stores/conversations.svelte.ts | 662 ++ .../server/webui/src/lib/stores/models.svelte.ts | 605 ++ .../webui/src/lib/stores/persisted.svelte.ts | 50 + .../server/webui/src/lib/stores/server.svelte.ts | 140 + .../server/webui/src/lib/stores/settings.svelte.ts | 421 + .../tools/server/webui/src/lib/types/api.d.ts | 430 + .../tools/server/webui/src/lib/types/chat.d.ts | 55 + .../tools/server/webui/src/lib/types/database.d.ts | 85 + .../tools/server/webui/src/lib/types/index.ts | 70 + .../tools/server/webui/src/lib/types/models.d.ts | 21 + .../tools/server/webui/src/lib/types/settings.d.ts | 67 + .../server/webui/src/lib/utils/api-headers.ts | 22 + .../webui/src/lib/utils/api-key-validation.ts | 45 + .../webui/src/lib/utils/attachment-display.ts | 61 + .../server/webui/src/lib/utils/attachment-type.ts | 105 + .../server/webui/src/lib/utils/audio-recording.ts | 226 + .../webui/src/lib/utils/autoresize-textarea.ts | 10 + .../tools/server/webui/src/lib/utils/branching.ts | 
283 + .../server/webui/src/lib/utils/browser-only.ts | 35 + .../tools/server/webui/src/lib/utils/clipboard.ts | 259 + .../server/webui/src/lib/utils/config-helpers.ts | 51 + .../webui/src/lib/utils/conversation-utils.ts | 30 + .../webui/src/lib/utils/convert-files-to-extra.ts | 192 + .../server/webui/src/lib/utils/file-preview.ts | 36 + .../tools/server/webui/src/lib/utils/file-type.ts | 222 + .../tools/server/webui/src/lib/utils/formatters.ts | 53 + .../tools/server/webui/src/lib/utils/index.ts | 95 + .../server/webui/src/lib/utils/is-ime-composing.ts | 5 + .../server/webui/src/lib/utils/latex-protection.ts | 270 + .../src/lib/utils/modality-file-validation.ts | 162 + .../server/webui/src/lib/utils/model-names.ts | 56 + .../server/webui/src/lib/utils/pdf-processing.ts | 150 + .../server/webui/src/lib/utils/portal-to-body.ts | 20 + .../tools/server/webui/src/lib/utils/precision.ts | 25 + .../webui/src/lib/utils/process-uploaded-files.ts | 136 + .../tools/server/webui/src/lib/utils/svg-to-png.ts | 71 + .../src/lib/utils/syntax-highlight-language.ts | 145 + .../tools/server/webui/src/lib/utils/text-files.ts | 97 + llama.cpp/tools/server/webui/src/lib/utils/text.ts | 7 + .../server/webui/src/lib/utils/webp-to-png.ts | 73 + .../tools/server/webui/src/routes/+error.svelte | 70 + .../tools/server/webui/src/routes/+layout.svelte | 223 + .../tools/server/webui/src/routes/+page.svelte | 91 + llama.cpp/tools/server/webui/src/routes/+page.ts | 6 + .../server/webui/src/routes/chat/[id]/+page.svelte | 176 + .../server/webui/src/routes/chat/[id]/+page.ts | 6 + .../server/webui/src/styles/katex-custom.scss | 13 + llama.cpp/tools/server/webui/static/favicon.svg | 1 + llama.cpp/tools/server/webui/static/loading.html | 12 + llama.cpp/tools/server/webui/svelte.config.js | 34 + .../tests/client/components/TestWrapper.svelte | 17 + .../server/webui/tests/client/page.svelte.test.ts | 11 + .../tools/server/webui/tests/e2e/demo.test.ts | 6 + .../webui/tests/stories/ChatForm.stories.svelte | 161 + .../webui/tests/stories/ChatMessage.stories.svelte | 207 + .../tests/stories/ChatSettings.stories.svelte | 19 + .../webui/tests/stories/ChatSidebar.stories.svelte | 97 + .../server/webui/tests/stories/Introduction.mdx | 44 + .../tests/stories/MarkdownContent.stories.svelte | 130 + .../webui/tests/stories/fixtures/ai-tutorial.ts | 164 + .../webui/tests/stories/fixtures/api-docs.ts | 160 + .../webui/tests/stories/fixtures/assets/1.jpg | Bin 0 -> 44891 bytes .../fixtures/assets/beautiful-flowers-lotus.webp | Bin 0 -> 817630 bytes .../tests/stories/fixtures/assets/example.pdf | Bin 0 -> 351048 bytes .../tests/stories/fixtures/assets/hf-logo.svg | 8 + .../webui/tests/stories/fixtures/blog-post.ts | 125 + .../webui/tests/stories/fixtures/data-analysis.ts | 124 + .../server/webui/tests/stories/fixtures/empty.ts | 2 + .../webui/tests/stories/fixtures/math-formulas.ts | 221 + .../server/webui/tests/stories/fixtures/readme.ts | 136 + .../tests/stories/fixtures/storybook-mocks.ts | 81 + .../server/webui/tests/unit/clipboard.test.ts | 423 + .../webui/tests/unit/latex-protection.test.ts | 376 + .../server/webui/tests/unit/model-names.test.ts | 51 + llama.cpp/tools/server/webui/tsconfig.json | 19 + llama.cpp/tools/server/webui/vite.config.ts | 166 + .../tools/server/webui/vitest-setup-client.ts | 2 + llama.cpp/tools/tokenize/CMakeLists.txt | 7 + llama.cpp/tools/tokenize/tokenize.cpp | 416 + llama.cpp/tools/tts/CMakeLists.txt | 8 + llama.cpp/tools/tts/README.md | 117 + llama.cpp/tools/tts/convert_pt_to_hf.py | 180 + 
llama.cpp/tools/tts/tts-outetts.py | 299 + llama.cpp/tools/tts/tts.cpp | 1093 +++ 527 files changed, 98897 insertions(+) create mode 100644 llama.cpp/tools/CMakeLists.txt create mode 100644 llama.cpp/tools/batched-bench/CMakeLists.txt create mode 100644 llama.cpp/tools/batched-bench/README.md create mode 100644 llama.cpp/tools/batched-bench/batched-bench.cpp create mode 100644 llama.cpp/tools/cli/CMakeLists.txt create mode 100644 llama.cpp/tools/cli/README.md create mode 100644 llama.cpp/tools/cli/cli.cpp create mode 100644 llama.cpp/tools/completion/CMakeLists.txt create mode 100644 llama.cpp/tools/completion/README.md create mode 100644 llama.cpp/tools/completion/completion.cpp create mode 100644 llama.cpp/tools/cvector-generator/CMakeLists.txt create mode 100644 llama.cpp/tools/cvector-generator/README.md create mode 100644 llama.cpp/tools/cvector-generator/completions.txt create mode 100644 llama.cpp/tools/cvector-generator/cvector-generator.cpp create mode 100644 llama.cpp/tools/cvector-generator/mean.hpp create mode 100644 llama.cpp/tools/cvector-generator/negative.txt create mode 100644 llama.cpp/tools/cvector-generator/pca.hpp create mode 100644 llama.cpp/tools/cvector-generator/positive.txt create mode 100644 llama.cpp/tools/export-lora/CMakeLists.txt create mode 100644 llama.cpp/tools/export-lora/README.md create mode 100644 llama.cpp/tools/export-lora/export-lora.cpp create mode 100644 llama.cpp/tools/fit-params/CMakeLists.txt create mode 100644 llama.cpp/tools/fit-params/README.md create mode 100644 llama.cpp/tools/fit-params/fit-params.cpp create mode 100644 llama.cpp/tools/gguf-split/CMakeLists.txt create mode 100644 llama.cpp/tools/gguf-split/README.md create mode 100644 llama.cpp/tools/gguf-split/gguf-split.cpp create mode 100755 llama.cpp/tools/gguf-split/tests.sh create mode 100644 llama.cpp/tools/imatrix/CMakeLists.txt create mode 100644 llama.cpp/tools/imatrix/README.md create mode 100644 llama.cpp/tools/imatrix/imatrix.cpp create mode 100644 llama.cpp/tools/llama-bench/CMakeLists.txt create mode 100644 llama.cpp/tools/llama-bench/README.md create mode 100644 llama.cpp/tools/llama-bench/llama-bench.cpp create mode 100644 llama.cpp/tools/mtmd/CMakeLists.txt create mode 100644 llama.cpp/tools/mtmd/README.md create mode 100644 llama.cpp/tools/mtmd/clip-graph.h create mode 100644 llama.cpp/tools/mtmd/clip-impl.h create mode 100644 llama.cpp/tools/mtmd/clip-model.h create mode 100644 llama.cpp/tools/mtmd/clip.cpp create mode 100644 llama.cpp/tools/mtmd/clip.h create mode 100644 llama.cpp/tools/mtmd/deprecation-warning.cpp create mode 100644 llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/llava_surgery.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py create mode 100644 llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py create mode 100644 llama.cpp/tools/mtmd/models/cogvlm.cpp create mode 100644 llama.cpp/tools/mtmd/models/conformer.cpp create mode 100644 llama.cpp/tools/mtmd/models/glm4v.cpp create mode 100644 llama.cpp/tools/mtmd/models/internvl.cpp create mode 100644 llama.cpp/tools/mtmd/models/kimik25.cpp create mode 100644 llama.cpp/tools/mtmd/models/kimivl.cpp create mode 100644 
llama.cpp/tools/mtmd/models/llama4.cpp create mode 100644 llama.cpp/tools/mtmd/models/llava.cpp create mode 100644 llama.cpp/tools/mtmd/models/minicpmv.cpp create mode 100644 llama.cpp/tools/mtmd/models/mobilenetv5.cpp create mode 100644 llama.cpp/tools/mtmd/models/models.h create mode 100644 llama.cpp/tools/mtmd/models/pixtral.cpp create mode 100644 llama.cpp/tools/mtmd/models/qwen2vl.cpp create mode 100644 llama.cpp/tools/mtmd/models/qwen3vl.cpp create mode 100644 llama.cpp/tools/mtmd/models/siglip.cpp create mode 100644 llama.cpp/tools/mtmd/models/whisper-enc.cpp create mode 100644 llama.cpp/tools/mtmd/models/youtuvl.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-audio.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-audio.h create mode 100644 llama.cpp/tools/mtmd/mtmd-cli.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-helper.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd-helper.h create mode 100644 llama.cpp/tools/mtmd/mtmd.cpp create mode 100644 llama.cpp/tools/mtmd/mtmd.h create mode 100644 llama.cpp/tools/mtmd/requirements.txt create mode 100644 llama.cpp/tools/mtmd/test-1.jpeg create mode 100644 llama.cpp/tools/mtmd/test-2.mp3 create mode 100755 llama.cpp/tools/mtmd/tests.sh create mode 100644 llama.cpp/tools/perplexity/CMakeLists.txt create mode 100644 llama.cpp/tools/perplexity/README.md create mode 100644 llama.cpp/tools/perplexity/perplexity.cpp create mode 100644 llama.cpp/tools/quantize/CMakeLists.txt create mode 100644 llama.cpp/tools/quantize/README.md create mode 100644 llama.cpp/tools/quantize/quantize.cpp create mode 100644 llama.cpp/tools/quantize/tests.sh create mode 100644 llama.cpp/tools/rpc/CMakeLists.txt create mode 100644 llama.cpp/tools/rpc/README.md create mode 100644 llama.cpp/tools/rpc/rpc-server.cpp create mode 100644 llama.cpp/tools/server/CMakeLists.txt create mode 100644 llama.cpp/tools/server/README-dev.md create mode 100644 llama.cpp/tools/server/README.md create mode 100644 llama.cpp/tools/server/bench/README.md create mode 100644 llama.cpp/tools/server/bench/bench.py create mode 100644 llama.cpp/tools/server/bench/prometheus.yml create mode 100644 llama.cpp/tools/server/bench/requirements.txt create mode 100644 llama.cpp/tools/server/bench/script.js create mode 100755 llama.cpp/tools/server/chat-llama2.sh create mode 100644 llama.cpp/tools/server/chat.mjs create mode 100755 llama.cpp/tools/server/chat.sh create mode 100644 llama.cpp/tools/server/public/index.html.gz create mode 100644 llama.cpp/tools/server/public/loading.html create mode 100755 llama.cpp/tools/server/public_legacy/colorthemes.css create mode 100644 llama.cpp/tools/server/public_legacy/completion.js create mode 100644 llama.cpp/tools/server/public_legacy/favicon.ico create mode 100644 llama.cpp/tools/server/public_legacy/index-new.html create mode 100644 llama.cpp/tools/server/public_legacy/index.html create mode 100644 llama.cpp/tools/server/public_legacy/index.js create mode 100644 llama.cpp/tools/server/public_legacy/json-schema-to-grammar.mjs create mode 100644 llama.cpp/tools/server/public_legacy/loading.html create mode 100644 llama.cpp/tools/server/public_legacy/prompt-formats.js create mode 100644 llama.cpp/tools/server/public_legacy/style.css create mode 100644 llama.cpp/tools/server/public_legacy/system-prompts.js create mode 100755 llama.cpp/tools/server/public_legacy/theme-beeninorder.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-ketivah.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-mangotango.css create mode 100755 
llama.cpp/tools/server/public_legacy/theme-playground.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-polarnight.css create mode 100755 llama.cpp/tools/server/public_legacy/theme-snowstorm.css create mode 100644 llama.cpp/tools/server/public_simplechat/datautils.mjs create mode 100644 llama.cpp/tools/server/public_simplechat/index.html create mode 100644 llama.cpp/tools/server/public_simplechat/readme.md create mode 100644 llama.cpp/tools/server/public_simplechat/simplechat.css create mode 100644 llama.cpp/tools/server/public_simplechat/simplechat.js create mode 100644 llama.cpp/tools/server/public_simplechat/simplechat_screens.webp create mode 100644 llama.cpp/tools/server/public_simplechat/ui.mjs create mode 100644 llama.cpp/tools/server/server-common.cpp create mode 100644 llama.cpp/tools/server/server-common.h create mode 100644 llama.cpp/tools/server/server-context.cpp create mode 100644 llama.cpp/tools/server/server-context.h create mode 100644 llama.cpp/tools/server/server-http.cpp create mode 100644 llama.cpp/tools/server/server-http.h create mode 100644 llama.cpp/tools/server/server-models.cpp create mode 100644 llama.cpp/tools/server/server-models.h create mode 100644 llama.cpp/tools/server/server-queue.cpp create mode 100644 llama.cpp/tools/server/server-queue.h create mode 100644 llama.cpp/tools/server/server-task.cpp create mode 100644 llama.cpp/tools/server/server-task.h create mode 100644 llama.cpp/tools/server/server.cpp create mode 100644 llama.cpp/tools/server/tests/.gitignore create mode 100644 llama.cpp/tools/server/tests/README.md create mode 100644 llama.cpp/tools/server/tests/conftest.py create mode 100644 llama.cpp/tools/server/tests/pytest.ini create mode 100644 llama.cpp/tools/server/tests/requirements.txt create mode 100755 llama.cpp/tools/server/tests/tests.sh create mode 100644 llama.cpp/tools/server/tests/unit/test_basic.py create mode 100644 llama.cpp/tools/server/tests/unit/test_chat_completion.py create mode 100644 llama.cpp/tools/server/tests/unit/test_compat_anthropic.py create mode 100644 llama.cpp/tools/server/tests/unit/test_compat_oai_responses.py create mode 100644 llama.cpp/tools/server/tests/unit/test_completion.py create mode 100644 llama.cpp/tools/server/tests/unit/test_ctx_shift.py create mode 100644 llama.cpp/tools/server/tests/unit/test_embedding.py create mode 100644 llama.cpp/tools/server/tests/unit/test_infill.py create mode 100644 llama.cpp/tools/server/tests/unit/test_lora.py create mode 100644 llama.cpp/tools/server/tests/unit/test_rerank.py create mode 100644 llama.cpp/tools/server/tests/unit/test_router.py create mode 100644 llama.cpp/tools/server/tests/unit/test_security.py create mode 100644 llama.cpp/tools/server/tests/unit/test_sleep.py create mode 100644 llama.cpp/tools/server/tests/unit/test_slot_save.py create mode 100644 llama.cpp/tools/server/tests/unit/test_speculative.py create mode 100644 llama.cpp/tools/server/tests/unit/test_template.py create mode 100644 llama.cpp/tools/server/tests/unit/test_tokenize.py create mode 100755 llama.cpp/tools/server/tests/unit/test_tool_call.py create mode 100644 llama.cpp/tools/server/tests/unit/test_vision_api.py create mode 100644 llama.cpp/tools/server/tests/utils.py create mode 100644 llama.cpp/tools/server/themes/README.md create mode 100644 llama.cpp/tools/server/themes/buttons-top/README.md create mode 100644 llama.cpp/tools/server/themes/buttons-top/buttons_top.png create mode 100644 llama.cpp/tools/server/themes/buttons-top/favicon.ico create mode 100644 
llama.cpp/tools/server/themes/buttons-top/index.html create mode 100644 llama.cpp/tools/server/themes/wild/README.md create mode 100644 llama.cpp/tools/server/themes/wild/favicon.ico create mode 100644 llama.cpp/tools/server/themes/wild/index.html create mode 100644 llama.cpp/tools/server/themes/wild/llama_cpp.png create mode 100644 llama.cpp/tools/server/themes/wild/llamapattern.png create mode 100644 llama.cpp/tools/server/themes/wild/wild.png create mode 100644 llama.cpp/tools/server/webui/.gitignore create mode 100644 llama.cpp/tools/server/webui/.npmrc create mode 100644 llama.cpp/tools/server/webui/.prettierignore create mode 100644 llama.cpp/tools/server/webui/.prettierrc create mode 100644 llama.cpp/tools/server/webui/.storybook/ModeWatcherDecorator.svelte create mode 100644 llama.cpp/tools/server/webui/.storybook/TooltipProviderDecorator.svelte create mode 100644 llama.cpp/tools/server/webui/.storybook/main.ts create mode 100644 llama.cpp/tools/server/webui/.storybook/preview.ts create mode 100644 llama.cpp/tools/server/webui/.storybook/vitest.setup.ts create mode 100644 llama.cpp/tools/server/webui/README.md create mode 100644 llama.cpp/tools/server/webui/components.json create mode 100644 llama.cpp/tools/server/webui/docs/architecture/high-level-architecture-simplified.md create mode 100644 llama.cpp/tools/server/webui/docs/architecture/high-level-architecture.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/chat-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/conversations-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/data-flow-simplified-model-mode.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/data-flow-simplified-router-mode.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/database-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/models-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/server-flow.md create mode 100644 llama.cpp/tools/server/webui/docs/flows/settings-flow.md create mode 100644 llama.cpp/tools/server/webui/eslint.config.js create mode 100644 llama.cpp/tools/server/webui/package-lock.json create mode 100644 llama.cpp/tools/server/webui/package.json create mode 100644 llama.cpp/tools/server/webui/playwright.config.ts create mode 100644 llama.cpp/tools/server/webui/scripts/dev.sh create mode 100755 llama.cpp/tools/server/webui/scripts/install-git-hooks.sh create mode 100755 llama.cpp/tools/server/webui/scripts/post-build.sh create mode 100644 llama.cpp/tools/server/webui/src/app.css create mode 100644 llama.cpp/tools/server/webui/src/app.d.ts create mode 100644 llama.cpp/tools/server/webui/src/app.html create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentPreview.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailFile.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentThumbnailImage.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsViewAll.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionFileAttachments.svelte create 
mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionRecord.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionSubmit.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActions.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormFileInputInvisible.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormHelperText.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatFormTextarea.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageActions.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageAssistant.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageBranchingControls.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageEditForm.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageSystem.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageThinkingBlock.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessageUser.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenDragOverlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenHeader.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettings.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFooter.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsParameterSourceIndicator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebar.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarActions.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarSearch.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/chat/ChatSidebar/handle-mobile-sidebar-item-click.ts create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatAttachmentPreview.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatAttachmentsViewAll.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatError.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogChatSettings.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogConfirmation.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogConversationSelection.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogConversationTitleUpdate.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogEmptyFileAlert.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/dialogs/DialogModelNotAvailable.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/ActionButton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/ActionDropdown.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeChatStatistic.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeInfo.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/BadgeModality.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/CodePreviewDialog.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/ConversationSelection.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/CopyToClipboardIcon.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/KeyboardShortcutInfo.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/RemoveButton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/SearchInput.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/misc/SyntaxHighlightedCode.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/models/ModelBadge.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/models/ModelsSelector.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/server/ServerLoadingSplash.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/app/server/ServerStatus.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-cancel.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-footer.svelte create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-overlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/alert-dialog-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert-dialog/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/alert-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/alert-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/alert.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/alert/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/badge/badge.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/badge/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/button/button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/button/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/card.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/card/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/checkbox/checkbox.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/checkbox/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/collapsible-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/collapsible-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/collapsible.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/collapsible/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-close.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-overlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/dialog-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dialog/index.ts create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-checkbox-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-group-heading.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-radio-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-radio-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-shortcut.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-sub-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-sub-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/dropdown-menu-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/dropdown-menu/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/input/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/input/input.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/label/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/label/label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-close.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-portal.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/popover/popover.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/scroll-area/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/scroll-area/scroll-area-scrollbar.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/scroll-area/scroll-area.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-group-heading.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-scroll-down-button.svelte create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/select/select-scroll-up-button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/select/select-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/separator/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/separator/separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-close.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-description.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-overlay.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-title.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sheet/sheet-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/constants.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/context.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group-label.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-group.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-input.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-inset.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-action.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-badge.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-skeleton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-sub-button.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-sub-item.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu-sub.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-menu.svelte create mode 100644 
llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-provider.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-rail.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-separator.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/sidebar/sidebar.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/skeleton/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/skeleton/skeleton.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/switch/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/switch/switch.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-body.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-caption.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-cell.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-footer.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-head.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-header.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table-row.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/table/table.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/textarea/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/textarea/textarea.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/tooltip/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/tooltip/tooltip-content.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/tooltip/tooltip-trigger.svelte create mode 100644 llama.cpp/tools/server/webui/src/lib/components/ui/utils.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/auto-scroll.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/binary-detection.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/default-context.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/floating-ui-constraints.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/icons.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/input-classes.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/latex-protection.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/literal-html.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/localstorage-keys.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/max-bundle-size.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/precision.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/processing-info.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/settings-config.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/supported-file-types.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/table-html-restorer.ts create mode 100644 
llama.cpp/tools/server/webui/src/lib/constants/tooltip-config.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/constants/viewport.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/attachment.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/chat.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/files.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/model.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/enums/server.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/hooks/is-mobile.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/hooks/use-model-change-validation.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/hooks/use-processing-state.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/enhance-code-blocks.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/enhance-links.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/literal-html.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/markdown/table-html-restorer.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/chat.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/database.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/models.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/parameter-sync.spec.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/parameter-sync.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/services/props.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/chat.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/conversations.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/models.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/persisted.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/server.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/stores/settings.svelte.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/api.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/chat.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/database.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/models.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/types/settings.d.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/api-headers.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/api-key-validation.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/attachment-display.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/attachment-type.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/audio-recording.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/autoresize-textarea.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/branching.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/browser-only.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/clipboard.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/config-helpers.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/conversation-utils.ts create 
mode 100644 llama.cpp/tools/server/webui/src/lib/utils/convert-files-to-extra.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/file-preview.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/file-type.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/formatters.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/index.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/is-ime-composing.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/latex-protection.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/modality-file-validation.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/model-names.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/pdf-processing.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/portal-to-body.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/precision.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/process-uploaded-files.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/svg-to-png.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/syntax-highlight-language.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/text-files.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/text.ts create mode 100644 llama.cpp/tools/server/webui/src/lib/utils/webp-to-png.ts create mode 100644 llama.cpp/tools/server/webui/src/routes/+error.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/+layout.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/+page.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/+page.ts create mode 100644 llama.cpp/tools/server/webui/src/routes/chat/[id]/+page.svelte create mode 100644 llama.cpp/tools/server/webui/src/routes/chat/[id]/+page.ts create mode 100644 llama.cpp/tools/server/webui/src/styles/katex-custom.scss create mode 100644 llama.cpp/tools/server/webui/static/favicon.svg create mode 100644 llama.cpp/tools/server/webui/static/loading.html create mode 100644 llama.cpp/tools/server/webui/svelte.config.js create mode 100644 llama.cpp/tools/server/webui/tests/client/components/TestWrapper.svelte create mode 100644 llama.cpp/tools/server/webui/tests/client/page.svelte.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/e2e/demo.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatForm.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatMessage.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatSettings.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/ChatSidebar.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/Introduction.mdx create mode 100644 llama.cpp/tools/server/webui/tests/stories/MarkdownContent.stories.svelte create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/ai-tutorial.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/api-docs.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/1.jpg create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/beautiful-flowers-lotus.webp create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/example.pdf create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/assets/hf-logo.svg create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/blog-post.ts create mode 
100644 llama.cpp/tools/server/webui/tests/stories/fixtures/data-analysis.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/empty.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/math-formulas.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/readme.ts create mode 100644 llama.cpp/tools/server/webui/tests/stories/fixtures/storybook-mocks.ts create mode 100644 llama.cpp/tools/server/webui/tests/unit/clipboard.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/unit/latex-protection.test.ts create mode 100644 llama.cpp/tools/server/webui/tests/unit/model-names.test.ts create mode 100644 llama.cpp/tools/server/webui/tsconfig.json create mode 100644 llama.cpp/tools/server/webui/vite.config.ts create mode 100644 llama.cpp/tools/server/webui/vitest-setup-client.ts create mode 100644 llama.cpp/tools/tokenize/CMakeLists.txt create mode 100644 llama.cpp/tools/tokenize/tokenize.cpp create mode 100644 llama.cpp/tools/tts/CMakeLists.txt create mode 100644 llama.cpp/tools/tts/README.md create mode 100644 llama.cpp/tools/tts/convert_pt_to_hf.py create mode 100644 llama.cpp/tools/tts/tts-outetts.py create mode 100644 llama.cpp/tools/tts/tts.cpp (limited to 'llama.cpp/tools') diff --git a/llama.cpp/tools/CMakeLists.txt b/llama.cpp/tools/CMakeLists.txt new file mode 100644 index 0000000..518f8b9 --- /dev/null +++ b/llama.cpp/tools/CMakeLists.txt @@ -0,0 +1,40 @@ +# dependencies + +find_package(Threads REQUIRED) + +# third-party + +# ... + +# flags + +llama_add_compile_flags() + +# tools + +if (EMSCRIPTEN) +else() + add_subdirectory(batched-bench) + add_subdirectory(gguf-split) + add_subdirectory(imatrix) + add_subdirectory(llama-bench) + add_subdirectory(completion) + add_subdirectory(perplexity) + add_subdirectory(quantize) + if (LLAMA_BUILD_SERVER) + add_subdirectory(cli) + add_subdirectory(server) + endif() + add_subdirectory(tokenize) + add_subdirectory(tts) + add_subdirectory(mtmd) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + endif() + add_subdirectory(fit-params) +endif() diff --git a/llama.cpp/tools/batched-bench/CMakeLists.txt b/llama.cpp/tools/batched-bench/CMakeLists.txt new file mode 100644 index 0000000..4a46b57 --- /dev/null +++ b/llama.cpp/tools/batched-bench/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-batched-bench) +add_executable(${TARGET} batched-bench.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/batched-bench/README.md b/llama.cpp/tools/batched-bench/README.md new file mode 100644 index 0000000..df67c47 --- /dev/null +++ b/llama.cpp/tools/batched-bench/README.md @@ -0,0 +1,60 @@ +# llama.cpp/example/batched-bench + +Benchmark the batched decoding performance of `llama.cpp` + +## Usage + +There are 2 modes of operation: + +- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`) +- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. 
`N_KV = PP + B*TG`) + +```bash +./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] + +# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared +./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 + +# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps + +# custom set of batches +./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 +``` + +## Sample results + +- `PP` - prompt tokens per batch +- `TG` - generated tokens per batch +- `B` - number of batches +- `N_KV` - required KV cache size +- `T_PP` - prompt processing time (i.e. time to first token) +- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`) +- `T_TG` - time to generate all batches +- `S_TG` - text generation speed (`(B*TG)/T_TG`) +- `T` - total time +- `S` - total speed (i.e. all tokens / total time) + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 | +| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 | +| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 | +| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 | +| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 | +| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 | +| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 | +| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 | +| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 | +| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 | +| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 | +| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 | + +### JSONL output + +Pass `--output-format jsonl` to output JSONL instead of Markdown, á la + +```json lines +{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094} +{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854} +``` diff --git a/llama.cpp/tools/batched-bench/batched-bench.cpp b/llama.cpp/tools/batched-bench/batched-bench.cpp new file mode 100644 index 0000000..0f627c5 --- /dev/null +++ b/llama.cpp/tools/batched-bench/batched-bench.cpp @@ -0,0 +1,256 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama.h" + +#include +#include +#include +#include + +static void print_usage(int, char ** argv) { + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG("\n"); +} + +int main(int argc, 
char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { + return 1; + } + + common_init(); + + int is_pp_shared = params.is_pp_shared; + int is_tg_separate = params.is_tg_separate; + + std::vector n_pp = params.n_pp; + std::vector n_tg = params.n_tg; + std::vector n_pl = params.n_pl; + + // init LLM + + llama_backend_init(); + llama_numa_init(params.numa); + + // initialize the model + + llama_model_params model_params = common_model_params_to_llama(params); + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); + + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + llama_context_params ctx_params = common_context_params_to_llama(params); + + // ensure enough sequences are available + ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); + + llama_context * ctx = llama_init_from_model(model, ctx_params); + + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + llama_model_free(model); + return 1; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + const auto get_token_rand = [n_vocab]() -> llama_token { + return std::rand() % n_vocab; + }; + + auto * mem = llama_get_memory(ctx); + + const int32_t n_kv_max = llama_n_ctx(ctx); + + llama_batch batch = llama_batch_init(n_kv_max, 0, 1); + + // decode in batches of ctx_params.n_batch tokens + auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch, bool synchronize) { + for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + return false; + } + + if (synchronize) { + llama_synchronize(ctx); + } + } + + return true; + }; + + // warm up + { + for (int i = 0; i < 16; ++i) { + common_batch_add(batch, get_token_rand(), i, { 0 }, false); + } + + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + } + + if (!params.batched_bench_output_jsonl) { + LOG("\n"); + LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, is_tg_separate = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), is_pp_shared, is_tg_separate, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG("\n"); + LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); + } + + for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { + for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) { + for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) { + const int pp = n_pp[i_pp]; + const int tg = n_tg[i_tg]; + const int pl = n_pl[i_pl]; + + 
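+                // KV cache required for this combination: with a shared prompt, the PP tokens
+                // are stored once (or once per sequence when the KV cache is not unified) plus
+                // TG tokens for each of the PL sequences; otherwise every sequence needs its
+                // own PP + TG tokens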
const int n_ctx_req = is_pp_shared ? (params.kv_unified ? pp : pl*pp) + pl*tg : pl*(pp + tg); + + if (n_ctx_req > n_kv_max) { + continue; + } + + common_batch_clear(batch); + + for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { + for (int i = 0; i < pp; ++i) { + common_batch_add(batch, get_token_rand(), i, { j }, i == pp - 1); + } + } + + llama_memory_clear(mem, false); + + const auto t_pp_start = ggml_time_us(); + + if (!decode_helper(ctx, batch, ctx_params.n_batch, false)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + + llama_synchronize(ctx); + + const auto t_pp_end = ggml_time_us(); + + if (is_pp_shared) { + for (int32_t i = 1; i < pl; ++i) { + llama_memory_seq_cp(mem, 0, i, -1, -1); + } + + if (!params.kv_unified) { + // run one dummy token to apply the memory copy + common_batch_clear(batch); + common_batch_add(batch, get_token_rand(), pp + 0, { 0 }, true); + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + llama_memory_seq_rm(mem, 0, pp, -1); + } + } + + const auto t_tg_start = ggml_time_us(); + + if (is_tg_separate) { + // decode pattern: + // 0 0 0 ... 1 1 1 ... 2 2 2 ... 3 3 3 ... + for (int j = 0; j < pl; ++j) { + for (int i = 0; i < tg; ++i) { + common_batch_clear(batch); + + common_batch_add(batch, get_token_rand(), pp + i, { j }, true); + + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + } + } + } else { + // decode pattern: + // 0123 0123 0123 ... + for (int i = 0; i < tg; ++i) { + common_batch_clear(batch); + + for (int j = 0; j < pl; ++j) { + common_batch_add(batch, get_token_rand(), pp + i, { j }, true); + } + + if (!decode_helper(ctx, batch, ctx_params.n_batch, true)) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + llama_free(ctx); + llama_model_free(model); + return 1; + } + } + } + + const auto t_tg_end = ggml_time_us(); + + const int32_t n_kv = n_ctx_req; + + const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f; + const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f; + const float t = t_pp + t_tg; + + const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp; + const float speed_tg = pl*tg / t_tg; + const float speed = ((is_pp_shared ? 
pp : pl*pp) + pl*tg) / t; + + if(params.batched_bench_output_jsonl) { + LOG( + "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " + "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", + n_kv_max, params.n_batch, params.n_ubatch, int(params.flash_attn_type), params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, + pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed + ); + } else { + LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); + } + } + } + } + + LOG("\n"); + llama_perf_context_print(ctx); + + llama_batch_free(batch); + + llama_free(ctx); + llama_model_free(model); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/cli/CMakeLists.txt b/llama.cpp/tools/cli/CMakeLists.txt new file mode 100644 index 0000000..b08fff4 --- /dev/null +++ b/llama.cpp/tools/cli/CMakeLists.txt @@ -0,0 +1,10 @@ +set(TARGET llama-cli) +add_executable(${TARGET} cli.cpp) +target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +include_directories(../server) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/cli/README.md b/llama.cpp/tools/cli/README.md new file mode 100644 index 0000000..4a15cba --- /dev/null +++ b/llama.cpp/tools/cli/README.md @@ -0,0 +1,192 @@ +# llama.cpp/tools/cli + +## Usage + + + + + +### Common params + +| Argument | Explanation | +| -------- | ----------- | +| `-h, --help, --usage` | print usage and exit | +| `--version` | show version and build info | +| `--license` | show source code license and dependencies | +| `-cl, --cache-list` | show list of models in cache | +| `--completion-bash` | print source-able bash completion script for llama.cpp | +| `--verbose-prompt` | print a verbose prompt before generation (default: false) | +| `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | +| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | +| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | +| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) | +| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) | +| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) | +| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | +| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | +| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | +| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)
(env: LLAMA_ARG_N_PREDICT) | +| `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | +| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | +| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | +| `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | +| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | +| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys | +| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | +| `-f, --file FNAME` | a file containing the prompt (default: none) | +| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | +| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | +| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | +| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | +| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | +| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | +| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | whether to memory-map the model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if llama.cpp was run without this option previously, it is recommended to drop the system page cache before using it
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | +| `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | +| `--list-devices` | print list of available devices and exit | +| `-ot, --override-tensor =,...` | override tensor buffer type
(env: LLAMA_ARG_OVERRIDE_TENSOR) | +| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU
(env: LLAMA_ARG_CPU_MOE) | +| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU
(env: LLAMA_ARG_N_CPU_MOE) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | +| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | +| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | +| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')
(env: LLAMA_ARG_FIT) | +| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024
(env: LLAMA_ARG_FIT_TARGET) | +| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096
(env: LLAMA_ARG_FIT_CTX) | +| `--check-tensors` | check model tensor data for invalid values (default: false) | +| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false | +| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | +| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) | +| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)
note: use comma-separated values | +| `--control-vector FNAME` | add a control vector
note: use comma-separated values to add multiple control vectors | +| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE
note: use comma-separated values (format: FNAME:SCALE,...) | +| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | +| `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | +| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | +| `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, defaults to ai/. quant is optional, defaults to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | +| `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, defaults to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | +| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | +| `-hfv, -hfrv, --hf-repo-v /[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | +| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) | +| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | +| `--log-disable` | Log disable | +| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | +| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | +| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | +| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)

(env: LLAMA_LOG_VERBOSITY) | +| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | +| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | +| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) | +| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) | + + +### Sampling params + +| Argument | Explanation | +| -------- | ----------- | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | +| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | +| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--temp N` | temperature (default: 0.80) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | +| `--dry-base N` | set DRY sampling base value (default: 1.75) | +| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | +| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | +| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | +| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | +| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar-file FNAME` | file to read grammar from | +| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)
(env: LLAMA_ARG_BACKEND_SAMPLING) | + + +### CLI-specific params + +| Argument | Explanation | +| -------- | ----------- | +| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | +| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | +| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | +| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | +| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | +| `--show-timings, --no-show-timings` | whether to show timing information after each response (default: true)
(env: LLAMA_ARG_SHOW_TIMINGS) | +| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) | +| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | +| `-sp, --special` | special tokens output enabled (default: false) | +| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:
- does not print special tokens and suffix/prefix
- interactive mode is also enabled
(default: auto enabled if chat template is available) | +| `-st, --single-turn` | run conversation for a single turn only, then exit when done
will not be interactive if first turn is predefined with --prompt
(default: false) | +| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' | +| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | +| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | +| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | +| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | +| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | +| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files | +| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | +| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | +| `-otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | +| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | +| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | +| `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | +| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | +| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | +| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | +| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | +| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | +| `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | +| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | +| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | +| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | +| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) | +| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) | +| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) | +| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) | + + diff --git a/llama.cpp/tools/cli/cli.cpp b/llama.cpp/tools/cli/cli.cpp new file mode 100644 index 0000000..02ccb72 --- /dev/null +++ b/llama.cpp/tools/cli/cli.cpp @@ -0,0 +1,421 @@ +#include "common.h" +#include "arg.h" +#include "console.h" +// #include "log.h" + +#include "server-context.h" +#include "server-task.h" + +#include +#include +#include +#include + +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + +const char * LLAMA_ASCII_LOGO = R"( +▄▄ ▄▄ +██ ██ +██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄ +██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██ +██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀ + ██ ██ + ▀▀ ▀▀ +)"; + +static std::atomic g_is_interrupted = false; +static bool should_stop() { + return g_is_interrupted.load(); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void signal_handler(int) { + if (g_is_interrupted.load()) { + // second Ctrl+C - exit immediately + // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock) + fprintf(stdout, "\033[0m\n"); + fflush(stdout); + std::exit(130); + } + g_is_interrupted.store(true); +} +#endif + +struct cli_context { + server_context ctx_server; + json messages = json::array(); + std::vector input_files; + task_params defaults; + + // thread for showing "loading" animation + std::atomic loading_show; + + cli_context(const common_params & params) { + defaults.sampling = params.sampling; + defaults.speculative = params.speculative; + defaults.n_keep = params.n_keep; + defaults.n_predict = params.n_predict; + defaults.antiprompt = params.antiprompt; + + defaults.stream = true; // make sure we always use streaming mode + defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way + // defaults.return_progress = true; // TODO: show progress + } + + std::string generate_completion(result_timings & out_timings) { + server_response_reader rd = ctx_server.get_response_reader(); + auto chat_params = format_chat(); + { + // TODO: reduce some copies here in the future + server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); + task.id = rd.get_new_id(); + task.index = 0; + task.params = defaults; // copy + task.cli_prompt = chat_params.prompt; // copy + task.cli_files = input_files; // copy + task.cli = true; + + // chat template settings + task.params.chat_parser_params = common_chat_parser_params(chat_params); + task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + if (!chat_params.parser.empty()) { + task.params.chat_parser_params.parser.load(chat_params.parser); + } + + rd.post_task({std::move(task)}); + } + + // wait for first result + console::spinner::start(); + server_task_result_ptr result = rd.next(should_stop); + + console::spinner::stop(); + std::string curr_content; + bool is_thinking = false; + + while (result) { + if (should_stop()) { + break; + } + if 
(result->is_error()) { + json err_data = result->to_json(); + if (err_data.contains("message")) { + console::error("Error: %s\n", err_data["message"].get().c_str()); + } else { + console::error("Error: %s\n", err_data.dump().c_str()); + } + return curr_content; + } + auto res_partial = dynamic_cast(result.get()); + if (res_partial) { + out_timings = std::move(res_partial->timings); + for (const auto & diff : res_partial->oaicompat_msg_diffs) { + if (!diff.content_delta.empty()) { + if (is_thinking) { + console::log("\n[End thinking]\n\n"); + console::set_display(DISPLAY_TYPE_RESET); + is_thinking = false; + } + curr_content += diff.content_delta; + console::log("%s", diff.content_delta.c_str()); + console::flush(); + } + if (!diff.reasoning_content_delta.empty()) { + console::set_display(DISPLAY_TYPE_REASONING); + if (!is_thinking) { + console::log("[Start thinking]\n"); + } + is_thinking = true; + console::log("%s", diff.reasoning_content_delta.c_str()); + console::flush(); + } + } + } + auto res_final = dynamic_cast(result.get()); + if (res_final) { + out_timings = std::move(res_final->timings); + break; + } + result = rd.next(should_stop); + } + g_is_interrupted.store(false); + // server_response_reader automatically cancels pending tasks upon destruction + return curr_content; + } + + // TODO: support remote files in the future (http, https, etc) + std::string load_input_file(const std::string & fname, bool is_media) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return ""; + } + if (is_media) { + raw_buffer buf; + buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + input_files.push_back(std::move(buf)); + return mtmd_default_marker(); + } else { + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + return content; + } + } + + common_chat_params format_chat() { + auto meta = ctx_server.get_meta(); + auto & chat_params = meta.chat_params; + + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(messages); + inputs.tools = {}; // TODO + inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; + inputs.json_schema = ""; // TODO + inputs.grammar = ""; // TODO + inputs.use_jinja = chat_params.use_jinja; + inputs.parallel_tool_calls = false; + inputs.add_generation_prompt = true; + inputs.enable_thinking = chat_params.enable_thinking; + + // Apply chat template to the list of messages + return common_chat_templates_apply(chat_params.tmpls.get(), inputs); + } +}; + +int main(int argc, char ** argv) { + common_params params; + + params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) { + return 1; + } + + // TODO: maybe support it later? 
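+    // llama-cli only runs in conversation (chat) mode; plain completion is handled by the
+    // separate llama-completion tool, as the messages below point out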
+ if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { + console::error("--no-conversation is not supported by llama-cli\n"); + console::error("please use llama-completion instead\n"); + } + + common_init(); + + // struct that contains llama context and inference + cli_context ctx_cli(params); + + llama_backend_init(); + llama_numa_init(params.numa); + + // TODO: avoid using atexit() here by making `console` a singleton + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + console::set_display(DISPLAY_TYPE_RESET); + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + console::log("\nLoading model... "); // followed by loading animation + console::spinner::start(); + if (!ctx_cli.ctx_server.load_model(params)) { + console::spinner::stop(); + console::error("\nFailed to load the model\n"); + return 1; + } + + console::spinner::stop(); + console::log("\n"); + + std::thread inference_thread([&ctx_cli]() { + ctx_cli.ctx_server.start_loop(); + }); + + auto inf = ctx_cli.ctx_server.get_meta(); + std::string modalities = "text"; + if (inf.has_inp_image) { + modalities += ", vision"; + } + if (inf.has_inp_audio) { + modalities += ", audio"; + } + + if (!params.system_prompt.empty()) { + ctx_cli.messages.push_back({ + {"role", "system"}, + {"content", params.system_prompt} + }); + } + + console::log("\n"); + console::log("%s\n", LLAMA_ASCII_LOGO); + console::log("build : %s\n", inf.build_info.c_str()); + console::log("model : %s\n", inf.model_name.c_str()); + console::log("modalities : %s\n", modalities.c_str()); + if (!params.system_prompt.empty()) { + console::log("using custom system prompt\n"); + } + console::log("\n"); + console::log("available commands:\n"); + console::log(" /exit or Ctrl+C stop or exit\n"); + console::log(" /regen regenerate the last response\n"); + console::log(" /clear clear the chat history\n"); + console::log(" /read add a text file\n"); + if (inf.has_inp_image) { + console::log(" /image add an image file\n"); + } + if (inf.has_inp_audio) { + console::log(" /audio add an audio file\n"); + } + console::log("\n"); + + // interactive loop + std::string cur_msg; + while (true) { + std::string buffer; + console::set_display(DISPLAY_TYPE_USER_INPUT); + if (params.prompt.empty()) { + console::log("\n> "); + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + } else { + // process input prompt from args + for (auto & fname : params.image) { + std::string marker = ctx_cli.load_input_file(fname, true); + if (marker.empty()) { + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + break; + } + console::log("Loaded media from '%s'\n", fname.c_str()); + cur_msg += marker; + } + buffer = params.prompt; + if (buffer.size() > 500) { + console::log("\n> %s ... 
(truncated)\n", buffer.substr(0, 500).c_str()); + } else { + console::log("\n> %s\n", buffer.c_str()); + } + params.prompt.clear(); // only use it once + } + console::set_display(DISPLAY_TYPE_RESET); + console::log("\n"); + + if (should_stop()) { + g_is_interrupted.store(false); + break; + } + + // remove trailing newline + if (!buffer.empty() &&buffer.back() == '\n') { + buffer.pop_back(); + } + + // skip empty messages + if (buffer.empty()) { + continue; + } + + bool add_user_msg = true; + + // process commands + if (string_starts_with(buffer, "/exit")) { + break; + } else if (string_starts_with(buffer, "/regen")) { + if (ctx_cli.messages.size() >= 2) { + size_t last_idx = ctx_cli.messages.size() - 1; + ctx_cli.messages.erase(last_idx); + add_user_msg = false; + } else { + console::error("No message to regenerate.\n"); + continue; + } + } else if (string_starts_with(buffer, "/clear")) { + ctx_cli.messages.clear(); + ctx_cli.input_files.clear(); + console::log("Chat history cleared.\n"); + continue; + } else if ( + (string_starts_with(buffer, "/image ") && inf.has_inp_image) || + (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { + // just in case (bad copy-paste for example), we strip all trailing/leading spaces + std::string fname = string_strip(buffer.substr(7)); + std::string marker = ctx_cli.load_input_file(fname, true); + if (marker.empty()) { + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + continue; + } + cur_msg += marker; + console::log("Loaded media from '%s'\n", fname.c_str()); + continue; + } else if (string_starts_with(buffer, "/read ")) { + std::string fname = string_strip(buffer.substr(6)); + std::string marker = ctx_cli.load_input_file(fname, false); + if (marker.empty()) { + console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); + continue; + } + cur_msg += marker; + console::log("Loaded text from '%s'\n", fname.c_str()); + continue; + } else { + // not a command + cur_msg += buffer; + } + + // generate response + if (add_user_msg) { + ctx_cli.messages.push_back({ + {"role", "user"}, + {"content", cur_msg} + }); + cur_msg.clear(); + } + result_timings timings; + std::string assistant_content = ctx_cli.generate_completion(timings); + ctx_cli.messages.push_back({ + {"role", "assistant"}, + {"content", assistant_content} + }); + console::log("\n"); + + if (params.show_timings) { + console::set_display(DISPLAY_TYPE_INFO); + console::log("\n"); + console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); + console::set_display(DISPLAY_TYPE_RESET); + } + + if (params.single_turn) { + break; + } + } + + console::set_display(DISPLAY_TYPE_RESET); + + console::log("\nExiting...\n"); + ctx_cli.ctx_server.terminate(); + inference_thread.join(); + + // bump the log level to display timings + common_log_set_verbosity_thold(LOG_LEVEL_INFO); + llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); + + return 0; +} diff --git a/llama.cpp/tools/completion/CMakeLists.txt b/llama.cpp/tools/completion/CMakeLists.txt new file mode 100644 index 0000000..126ae6a --- /dev/null +++ b/llama.cpp/tools/completion/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-completion) +add_executable(${TARGET} completion.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git 
a/llama.cpp/tools/completion/README.md b/llama.cpp/tools/completion/README.md new file mode 100644 index 0000000..3ca3e68 --- /dev/null +++ b/llama.cpp/tools/completion/README.md @@ -0,0 +1,578 @@ +# llama.cpp/tools/completion + +This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. + +## Table of Contents + +1. [Quick Start](#quick-start) +2. [Usage](#usage) +3. [Common Options](#common-options) +4. [Input Prompts](#input-prompts) +5. [Interaction](#interaction) +6. [Context Management](#context-management) +7. [Generation Flags](#generation-flags) +8. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options) +9. [Additional Options](#additional-options) + +## Quick Start + +To get started right away, run the following command, making sure to use the correct path for the model you have: + +First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face. +[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true) + +Once downloaded, place your model in the models folder in llama.cpp. + +### Unix-based systems (Linux, macOS, etc.): + +##### Input prompt (One-and-done) + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" +``` +##### Conversation mode (Allow for continuous interaction with the model) + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" +``` + +##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): +```bash +./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 +``` + +### Windows: + +##### Input prompt (One-and-done) +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" +``` +##### Conversation mode (Allow for continuous interaction with the model) + +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```powershell +./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" +``` + +#### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): + +```powershell +llama-completion.exe -m 
models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 +``` + +## Usage + + + + + +### Common params + +| Argument | Explanation | +| -------- | ----------- | +| `-h, --help, --usage` | print usage and exit | +| `--version` | show version and build info | +| `--license` | show source code license and dependencies | +| `-cl, --cache-list` | show list of models in cache | +| `--completion-bash` | print source-able bash completion script for llama.cpp | +| `--verbose-prompt` | print a verbose prompt before generation (default: false) | +| `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | +| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | +| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | +| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) | +| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) | +| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) | +| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | +| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | +| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) | +| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | +| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | +| `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | +| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | +| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | +| `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | +| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | +| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys | +| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)
(env: LLAMA_ARG_PERF) | +| `-f, --file FNAME` | a file containing the prompt (default: none) | +| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | +| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) | +| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | +| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | +| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | +| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | +| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | +| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | +| `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | +| `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | +| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | +| `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | +| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | +| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | +| `--list-devices` | print list of available devices and exit | +| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type
(env: LLAMA_ARG_OVERRIDE_TENSOR) | +| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU
(env: LLAMA_ARG_CPU_MOE) | +| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU
(env: LLAMA_ARG_N_CPU_MOE) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | +| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | +| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | +| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')
(env: LLAMA_ARG_FIT) | +| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024
(env: LLAMA_ARG_FIT_TARGET) | +| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096
(env: LLAMA_ARG_FIT_CTX) | +| `--check-tensors` | check model tensor data for invalid values (default: false) | +| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, use comma-separated values.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false | +| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | +| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) | +| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)
note: use comma-separated values | +| `--control-vector FNAME` | add a control vector
note: use comma-separated values to add multiple control vectors | +| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE
note: use comma-separated values (format: FNAME:SCALE,...) | +| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | +| `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | +| `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | +| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | +| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | +| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | +| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | +| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_REPO_V) | +| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)
(env: LLAMA_ARG_HF_FILE_V) | +| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | +| `--log-disable` | Log disable | +| `--log-file FNAME` | Log to file
(env: LLAMA_LOG_FILE) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | +| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | +| `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | +| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
(default: 3)

(env: LLAMA_LOG_VERBOSITY) | +| `--log-prefix` | Enable prefix in log messages
(env: LLAMA_LOG_PREFIX) | +| `--log-timestamps` | Enable timestamps in log messages
(env: LLAMA_LOG_TIMESTAMPS) | +| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) | +| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) | + + +### Sampling params + +| Argument | Explanation | +| -------- | ----------- | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | +| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | +| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | +| `--temp N` | temperature (default: 0.80) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | +| `--dry-base N` | set DRY sampling base value (default: 1.75) | +| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | +| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | +| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | +| `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | +| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar-file FNAME` | file to read grammar from | +| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | +| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)
(env: LLAMA_ARG_BACKEND_SAMPLING) | + + +### Completion-specific params + +| Argument | Explanation | +| -------- | ----------- | +| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | +| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | +| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | +| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | +| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) | +| `-ptc, --print-token-count N` | print token count every N tokens (default: -1) | +| `--prompt-cache FNAME` | file to cache prompt state for faster startup (default: none) | +| `--prompt-cache-all` | if specified, saves user input and generations to cache as well | +| `--prompt-cache-ro` | if specified, uses the prompt cache but does not update it | +| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | +| `-sp, --special` | special tokens output enabled (default: false) | +| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:
- does not print special tokens and suffix/prefix
- interactive mode is also enabled
(default: auto enabled if chat template is available) | +| `-st, --single-turn` | run conversation for a single turn only, then exit when done
will not be interactive if first turn is predefined with --prompt
(default: false) | +| `-i, --interactive` | run in interactive mode (default: false) | +| `-if, --interactive-first` | run in interactive mode and wait for input right away (default: false) | +| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' | +| `--in-prefix-bos` | prefix BOS to user inputs, preceding the `--in-prefix` string | +| `--in-prefix STRING` | string to prefix user inputs with (default: empty) | +| `--in-suffix STRING` | string to suffix after user inputs with (default: empty) | +| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | +| `-gan, --grp-attn-n N` | group-attention factor (default: 1)
(env: LLAMA_ARG_GRP_ATTN_N) | +| `-gaw, --grp-attn-w N` | group-attention width (default: 512)
(env: LLAMA_ARG_GRP_ATTN_W) | +| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | +| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | +| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | + + + +## Common Options + +In this section, we cover the most commonly used options for running the `llama-completion` program with the LLaMA models: + +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)). +- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. +- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. +- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 4096, but if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference. +- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\' +- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has. +- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. + +## Input Prompts + +The `llama-completion` program provides several ways to interact with the LLaMA models using input prompts: + +- `--prompt PROMPT`: Provide a prompt directly as a command-line option. +- `--file FNAME`: Provide a file containing a prompt or multiple prompts. +- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template (if provided)). +- `--system-prompt-file FNAME`: Provide a file containing a system prompt. +- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) + +## Interaction + +The `llama-completion` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`. + +In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing. + +### Interaction Options + +- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. +- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. 
+- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found) +- `-no-cnv`: Disable conversation mode (default: false) +- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit. +- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false) +- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text. + +By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. + +### Reverse Prompts + +Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered: + +- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space. + +To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt. + +### In-Prefix + +The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag: + +```sh +./llama-completion -r "User:" --in-prefix " " +``` + +### In-Suffix + +The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag: + +```sh +./llama-completion -r "User:" --in-prefix " " --in-suffix "Assistant:" +``` +When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. + +### Chat templates + + `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. + + Example usage: `--chat-template gemma` + +`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model contains an outdated or incompatible template; some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py + +## Context Management + +During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text.
When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. + +### Context Size + +- `-c N, --ctx-size N`: Set the size of the prompt context (default: 4096, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference. + +### Extended Context Size + +Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. + +- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. + +### Keep Prompt + +The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained. + +- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. + +By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation. + +## Generation Flags + +The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case. + +### Number of Tokens to Predict + +- `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled) + +The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. + +A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output. + +If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. + +The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full. + +It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. 
If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. + +### Temperature + +- `--temp N`: Adjust the randomness of the generated text (default: 0.8). + +Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. + +Example usage: `--temp 0` + +### Repeat Penalty + +- `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled). +- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). + +The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1. + +The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). + +### DRY Repetition Penalty + +DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)). + +- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled). +- `--dry-base N`: Set the DRY sampling base value (default: 1.75). +- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2). +- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size). +- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. If the string `"none"` is supplied, no sequence breakers are used. + +The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8. + +The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions. + +The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words. + +The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens. 
+ +The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied. + +DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence. + +Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"` + +### Top-K Sampling + +- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40). + +Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40. + +Example usage: `--top-k 30` + +### Top-P Sampling + +- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). + +Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9. + +Example usage: `--top-p 0.95` + +### Min-P Sampling + +- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.1). + +The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. + +Example usage: `--min-p 0.05` + +### Adaptive-P Sampling + +- `--adaptive-target N`: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) +- `--adaptive-decay N`: EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) + +Adaptive-P: Select tokens near a configurable target probability over time. + +The adaptive-p sampler transforms the token probability distribution to favor tokens that fall near a user-configurable probability target. Internally, the sampler maintains an exponential moving average of the *ORIGINAL* probabilities of selected tokens at each sampling step. It uses this EMA to compute an adapted target probability at each sampling step, thus maintaining the desired target probability over time. Only mild truncation before this sampler is recommended. It is suggested to apply min-p before adaptive-p as the only other active sampler. 
+ +Recommended starting values: `--adaptive-target 0.55 --adaptive-decay 0.9` + +For more info, refer to: [llama.cpp#17927](https://github.com/ggml-org/llama.cpp/pull/17927) + +### Locally Typical Sampling + +- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). + +Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling. + +Example usage: `--typical 0.9` + +### Mirostat Sampling + +- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). +- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1). +- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0). + +Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps). + +The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`. + +The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`. + +Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` + +### XTC Sampling + +- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). +- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). + +Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. + +By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. + +Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. 
+ +Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1` + +### Top-nσ Sampling + +- `--top-nsigma N`: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1, -1 = disabled). + +Top-nσ sampling is a text generation method that selects tokens based on a statistical threshold in pre-softmax logits. It works by only sampling from tokens with logits that are within n * σ of the maximum logit. This method helps maintain a stable sampling space regardless of temperature scaling, allowing it to perform well on reasoning tasks even in high temperatures. Without complex probability manipulation, it efficiently filters tokens directly on the pre-softmax logits. A higher value for top-nsigma (e.g., 5) will take more noisy tokens into consideration, while a lower value (e.g., 1) will focus on the more informative region of the sampling space. + +Example usage: `--top-nsigma 1` + +### Logit Bias + +- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. + +The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated. + +For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced. + +A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.) + +Example usage: `--logit-bias 29905-inf` + +### RNG Seed + +- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed). + +The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run. + +## Performance Tuning and Memory Options + +These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case. + +### Number of Threads + +- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. In some systems, it is beneficial to use a higher number of threads during batch processing than during generation. If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation.
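+
+For illustration, on a hypothetical machine with 8 physical cores you might pin generation to the physical core count while giving prompt processing a few extra threads (the exact values are system-dependent and worth benchmarking):
+
+```bash
+./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -t 8 -tb 12 -p "Once upon a time"
+```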
+ +### Mlock + +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM. + +### No Memory Mapping + +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all. + +### NUMA support + +- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes. +- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node. +- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allows arbitrary core usage patterns, for example a map that uses all the cores on one NUMA node, and just enough cores on a second node to saturate the inter-node memory bus. + + These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. + +### Batch Size + +- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`. + +- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`. + +### Prompt Caching + +- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
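+
+As a sketch of typical usage (the cache file name is illustrative), the first run below evaluates the prompt and saves the state, and a later run with the same leading prompt restores it instead of re-evaluating; add `--prompt-cache-ro` to reuse the cache without updating it:
+
+```bash
+# first run: evaluates the prompt and writes the cached state
+./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt-cache prompt.cache -p "Once upon a time" -n 64
+
+# later run: restores the cached prompt state, without updating the cache
+./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt-cache prompt.cache --prompt-cache-ro -p "Once upon a time" -n 64
+```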
+ +### Grammars & JSON schemas + +- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax. + +- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead. + +### Quantization + +For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). + +## LoRA (Low-Rank Adaptation) adapters + +- `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters. +- `--lora-scaled FNAME`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can repeated to use multiple adapters. + +You can add LoRA adapters using `--lora` or `--lora-scaled`. For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf 0.5 --lora-scaled lora_task_B.gguf 0.5`. + +LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert-lora-to-gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed. + +## Additional Options + +These options provide extra functionality and customization when running the LLaMA models: + +- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. +- `--verbose-prompt`: Print the prompt before generating text. +- `--no-display-prompt`: Don't print prompt at generation. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple devices this option controls how tensors should be split across devices. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each device should get in order. For example, "3,2" will assign 60% of the data to device 0 and 40% to device 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. The list of the devices which are being used is printed on startup and can be different from the device list given by `--list-devices` or e.g. `nvidia-smi`. +- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. 
If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache. diff --git a/llama.cpp/tools/completion/completion.cpp b/llama.cpp/tools/completion/completion.cpp new file mode 100644 index 0000000..9771327 --- /dev/null +++ b/llama.cpp/tools/completion/completion.cpp @@ -0,0 +1,1001 @@ +#include "arg.h" +#include "common.h" +#include "console.h" +#include "log.h" +#include "sampling.h" +#include "llama.h" +#include "chat.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static llama_context ** g_ctx; +static llama_model ** g_model; +static common_sampler ** g_smpl; +static common_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; +static bool is_interacting = false; +static bool need_insert_eot = false; + +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]); + LOG("\n"); +} + +static bool file_exists(const std::string & path) { + std::ifstream f(path.c_str()); + return f.good(); +} + +static bool file_is_empty(const std::string & path) { + std::ifstream f; + f.exceptions(std::ifstream::failbit | std::ifstream::badbit); + f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + return f.tellg() == 0; +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting && g_params->interactive) { + is_interacting = true; + need_insert_eot = true; + } else { + console::cleanup(); + LOG("\n"); + common_perf_print(*g_ctx, *g_smpl); + + // make sure all logs are flushed + LOG("Interrupted by user\n"); + common_log_pause(common_log_main()); + + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + common_params params; + g_params = ¶ms; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) { + return 1; + } + + common_init(); + + auto & sparams = params.sampling; + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.embedding) { + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); + + return 0; + } + + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) { + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) { + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + } + + LOG_INF("%s: llama backend init\n", __func__); + + 
llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model = nullptr; + llama_context * ctx = nullptr; + common_sampler * smpl = nullptr; + + g_model = &model; + g_ctx = &ctx; + g_smpl = &smpl; + + std::vector chat_msgs; + + // load the model and apply lora adapter, if any + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); + + auto llama_init = common_init_from_params(params); + + ctx = llama_init->context(); + model = llama_init->model(); + smpl = llama_init->sampler(0); + + if (ctx == NULL) { + LOG_ERR("%s: error: unable to create context\n", __func__); + return 1; + } + + llama_memory_t mem = llama_get_memory(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + // note: the time for chat template initialization is not negligible: + auto chat_templates = common_chat_templates_init(model, params.chat_template); + + // start measuring performance timings from here + llama_perf_context_reset(ctx); + + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + LOG_ERR("%s: no CPU backend found\n", __func__); + return 1; + } + auto * reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + + struct ggml_threadpool_params tpp_batch = + ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); + struct ggml_threadpool_params tpp = + ggml_threadpool_params_from_cpu_params(params.cpuparams); + + if (!set_process_priority(params.cpuparams.priority)) { + LOG_ERR("%s: error: failed to set process priority\n", __func__); + return 1; + } + + struct ggml_threadpool * threadpool_batch = NULL; + if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); + if (!threadpool_batch) { + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; + } + + // start the non-batch threadpool in the paused state + tpp.paused = true; + } + + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); + if (!threadpool) { + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; + } + + llama_attach_threadpool(ctx, threadpool, threadpool_batch); + + const int n_ctx_train = llama_model_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + + if (n_ctx > n_ctx_train) { + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); + } + + // auto enable conversation mode if chat template is available + const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get()); + if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { + if (has_chat_template) { + LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); + params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; + } else { + params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + } + } + + // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning + if (params.conversation_mode && !has_chat_template) { + LOG_WRN("%s: chat template is not available or is not supported. 
This may cause the model to output suboptimal responses\n", __func__); + } + + // print chat template example in conversation mode + if (params.conversation_mode) { + if (params.enable_chat_template) { + if (!params.prompt.empty() && params.system_prompt.empty()) { + LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n"); + } + + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs).c_str()); + } else { + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + } + } + + // print system information + { + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); + } + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + if (!file_exists(path_session)) { + LOG_INF("%s: session file does not exist, will create.\n", __func__); + } else if (file_is_empty(path_session)) { + LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__); + } else { + // The file exists and is not empty + session_tokens.resize(n_ctx); + size_t n_token_count_out = 0; + if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + } + } + + const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja; + if (!llama_model_has_encoder(model)) { + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + } + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); + + std::vector embd_inp; + + bool waiting_for_first_input = false; + auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { + common_chat_msg new_msg; + new_msg.role = role; + new_msg.content = content; + auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja); + chat_msgs.push_back(new_msg); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); + return formatted; + }; + + std::string prompt; + { + if (params.conversation_mode && params.enable_chat_template) { + if (!params.system_prompt.empty()) { + // format the system prompt (will use template default if empty) + chat_add_and_format("system", params.system_prompt); + } + + if (!params.prompt.empty()) { + // format and append the user prompt + chat_add_and_format("user", params.prompt); + } else { + waiting_for_first_input = true; + } + + if (!params.system_prompt.empty() || !params.prompt.empty()) { + common_chat_templates_inputs inputs; + inputs.use_jinja = g_params->use_jinja; + inputs.messages = chat_msgs; + inputs.add_generation_prompt = !params.prompt.empty(); + + prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt; + } + } else { + // otherwise use the prompt as is + prompt = params.prompt; + } + + if (params.interactive_first || !prompt.empty() || session_tokens.empty()) { + LOG_DBG("tokenize the prompt\n"); + embd_inp = common_tokenize(ctx, prompt, true, true); + } else { + LOG_DBG("use session tokens\n"); + embd_inp = 
session_tokens; + } + + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); + } + + // Should not run without any tokens + if (!waiting_for_first_input && embd_inp.empty()) { + if (add_bos) { + embd_inp.push_back(llama_vocab_bos(vocab)); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); + } else { + LOG_ERR("input is empty\n"); + return -1; + } + } + + // Tokenize negative prompt + if ((int) embd_inp.size() > n_ctx - 4) { + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + bool session_do_save = false; + + { + size_t n_match = 0; + + if (!session_tokens.empty()) { + for (llama_token id : session_tokens) { + if (n_match >= embd_inp.size() || id != embd_inp[n_match]) { + break; + } + n_match++; + } + if (params.prompt.empty() && n_match == embd_inp.size()) { + LOG_INF("%s: using full prompt from session file\n", __func__); + } else if (n_match >= embd_inp.size()) { + LOG_INF("%s: session file has exact match for prompt!\n", __func__); + } else if (n_match < (embd_inp.size() / 2)) { + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_match, embd_inp.size()); + } else { + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_match, embd_inp.size()); + } + + if (session_tokens.size() == n_match) { + // [TAG_CONTEXT_STATE_LOGITS] + // in this case, we are going to reuse the logits from the session + // if we ever decide to remove the logits from the session, we need to handle this somehow + // ref: https://github.com/ggml-org/llama.cpp/pull/18862#issuecomment-3756330941 + } + + // remove any "future" tokens that we might have inherited from the previous session + if (session_tokens.size() > n_match) { + if (!llama_memory_seq_rm(mem, -1, n_match, -1)) { + LOG_WRN("%s: unable to resuse common prefix (for example, when the memory is recurrent)\n", __func__); + llama_memory_clear(mem, true); + session_tokens.clear(); + n_match = 0; + } else { + session_tokens.resize(n_match); + } + } + } + + session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro; + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { + params.n_keep = (int)embd_inp.size(); + } else { + params.n_keep += add_bos; // always keep the BOS token + } + + if (params.conversation_mode) { + if (params.single_turn && !params.prompt.empty()) { + params.interactive = false; + params.interactive_first = false; + } else { + params.interactive_first = true; + } + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + if (params.verbose_prompt) { + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (params.n_keep > add_bos) { + LOG_INF("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_CNT("'\n"); + } + LOG_INF("\n"); + } + + // ctrl+C handling + { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct 
sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + } + + if (params.interactive) { + LOG_INF("%s: interactive mode on.\n", __func__); + + if (!params.antiprompt.empty()) { + for (const auto & antiprompt : params.antiprompt) { + LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + if (params.input_prefix_bos) { + LOG_INF("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + + if (!params.input_suffix.empty()) { + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) { + auto tmp = common_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); + + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + // group-attention state + // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) + int ga_i = 0; + + const int ga_n = params.grp_attn_n; + const int ga_w = params.grp_attn_w; + + if (ga_n != 1) { + GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT + //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT + //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + } + LOG_INF("\n"); + + if (params.interactive) { + const char * control_message; + if (params.multiline_input) { + control_message = " - To return control to the AI, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to the AI.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_INF("== Running in interactive mode. 
==\n"); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + LOG_INF( " - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_INF( "%s", control_message); + if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) { + LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n"); + } + LOG_INF("\n"); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool display = true; + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(DISPLAY_TYPE_PROMPT); + display = params.display_prompt; + + std::vector embd; + + // single-token antiprompts + std::vector antiprompt_token; + + for (const std::string & antiprompt : params.antiprompt) { + auto ids = ::common_tokenize(ctx, antiprompt, false, true); + if (ids.size() == 1) { + antiprompt_token.push_back(ids[0]); + } + } + + if (llama_model_has_encoder(model)) { + int enc_input_size = embd_inp.size(); + llama_token * enc_input_buf = embd_inp.data(); + + if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) { + LOG_ERR("%s : failed to eval\n", __func__); + return 1; + } + + llama_token decoder_start_token_id = llama_model_decoder_start_token(model); + if (decoder_start_token_id == LLAMA_TOKEN_NULL) { + decoder_start_token_id = llama_vocab_bos(vocab); + } + + embd_inp.clear(); + embd_inp.push_back(decoder_start_token_id); + } + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { + // predict + if (!embd.empty()) { + // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + + console::set_display(DISPLAY_TYPE_ERROR); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); + console::set_display(DISPLAY_TYPE_RESET); + } + + if (ga_n == 1) { + // infinite text generation via context shifting + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + + if (n_past + (int) embd.size() >= n_ctx) { + if (!params.ctx_shift){ + LOG_WRN("\n\n%s: context full and context shift is disabled => stopping\n", __func__); + break; + } + + if (params.n_predict == -2) { + LOG_WRN("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_memory_seq_rm (mem, 0, params.n_keep , params.n_keep + n_discard); + llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); + } + } else { + // context extension via Self-Extend + while (n_past >= ga_i + ga_w) { + const int ib = (ga_n*ga_i)/ga_w; + const int bd = (ga_w/ga_n)*(ga_n - 1); + const int dd = (ga_w/ga_n) - ib*bd - ga_w; + + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + + llama_memory_seq_add(mem, 0, ga_i, n_past, ib*bd); + llama_memory_seq_div(mem, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + + n_past -= bd; + + ga_i += ga_w/ga_n; + + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + } + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + if (!embd.empty()) { + int n_eval = (int) embd.size(); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); + + GGML_ASSERT(n_eval <= params.n_batch); + if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) { + LOG_ERR("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG_DBG("n_past = %d\n", n_past); + // Display total tokens alongside total time + if (params.n_print > 0 && n_past % params.n_print == 0) { + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + } + } + + if (!embd.empty() && !path_session.empty()) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + // optionally save the session on first sample 
(for faster prompt loading next time) + if (session_do_save) { + session_do_save = false; + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + LOG_DBG("saved session to %s\n", path_session.c_str()); + } + + const llama_token id = common_sampler_sample(smpl, ctx, -1); + + common_sampler_accept(smpl, id, /* accept_grammar= */ true); + + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); + + embd.push_back(id); + + if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) { + assistant_ss << common_token_to_piece(ctx, id, false); + } + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG_DBG("n_remain: %d\n", n_remain); + } else { + // some user input remains from prompt or interaction, forward it to processing + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); + + ++n_consumed; + if ((int) embd.size() == params.n_batch) { + break; + } + } + } + + // display text + if (input_echo && display) { + for (auto id : embd) { + const std::string token_str = common_token_to_piece(ctx, id, params.special); + + // Console/Stream Output + LOG("%s", token_str.c_str()); + + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check + if (embd.size() > 1) { + // Incoming Requested Tokens + input_tokens.push_back(id); + } else { + // Outgoing Generated Tokens + output_tokens.push_back(id); + output_ss << token_str; + } + } + } + + // reset color to default if there is no pending user input + if (input_echo && (int) embd_inp.size() == n_consumed) { + console::set_display(DISPLAY_TYPE_RESET); + display = true; + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + // check for reverse prompt in the last n_prev tokens + if (!params.antiprompt.empty()) { + const int n_prev = 32; + const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev); + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string & antiprompt : params.antiprompt) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? 
last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + // check for reverse prompt using special tokens + // avoid calling common_sampler_last() if last_output is empty + if (!last_output.empty()) { + llama_token last_token = common_sampler_last(smpl); + for (auto token : antiprompt_token) { + if (token == last_token) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + } + + if (is_antiprompt) { + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); + } + } + + // deal with end of generation tokens in interactive mode + if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { + LOG_DBG("found an EOG token\n"); + + if (params.interactive) { + if (!params.antiprompt.empty()) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + if (params.enable_chat_template) { + chat_add_and_format("assistant", assistant_ss.str()); + } + is_interacting = true; + LOG("\n"); + } + } + + if (params.conversation_mode && !waiting_for_first_input) { + if (!prompt.empty()) { + prompt.clear(); + is_interacting = false; + } + } + + if ((n_past > 0 || waiting_for_first_input) && is_interacting) { + LOG_DBG("waiting for user input\n"); + + if (params.conversation_mode) { + LOG("\n> "); + } + + if (params.input_prefix_bos) { + LOG_DBG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_vocab_bos(vocab)); + } + + std::string buffer; + if (!params.input_prefix.empty() && !params.conversation_mode) { + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); + } + + // color user input only + console::set_display(DISPLAY_TYPE_USER_INPUT); + display = params.display_prompt; + + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(DISPLAY_TYPE_RESET); + display = true; + + if (buffer.empty()) { // Ctrl+D on empty line exits + LOG("EOF by user\n"); + break; + } + + if (buffer.back() == '\n') { + // Implement #587: + // If the user wants the text to end in a newline, + // this should be accomplished by explicitly adding a newline by using \ followed by return, + // then returning control by pressing return again. + buffer.pop_back(); + } + + if (buffer.empty()) { // Enter key on empty line lets the user pass control back + LOG_DBG("empty line, passing control back\n"); + } else { // Add tokens to embd only if the input buffer is non-empty + // append input suffix if any + if (!params.input_suffix.empty() && !params.conversation_mode) { + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); + } + + LOG_DBG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + if (params.escape) { + string_process_escapes(buffer); + } + + bool format_chat = params.conversation_mode && params.enable_chat_template; + std::string user_inp = format_chat + ? 
chat_add_and_format("user", std::move(buffer)) + : std::move(buffer); + // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix) + const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat); + const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true); + + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); + + // if user stop generation mid-way, we must add EOT to finish model's last response + if (need_insert_eot && format_chat) { + llama_token eot = llama_vocab_eot(vocab); + embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot); + need_insert_eot = false; + } + + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + + if (params.verbose_prompt) { + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size() - original_size); + } + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + const std::string token_str = common_token_to_piece(ctx, token); + output_tokens.push_back(token); + output_ss << token_str; + + if (params.verbose_prompt) { + LOG_INF("%6d -> '%s'\n", token, token_str.c_str()); + } + } + + // reset assistant message + assistant_ss.str(""); + + n_remain -= line_inp.size(); + LOG_DBG("n_remain: %d\n", n_remain); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0 || waiting_for_first_input) { + if (is_interacting) { + common_sampler_reset(smpl); + } + is_interacting = false; + + if (waiting_for_first_input && params.single_turn) { + params.interactive = false; + params.interactive_first = false; + } + waiting_for_first_input = false; + } + } + + // end of generation + if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) { + LOG(" [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). 
+ if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + LOG("\n\n"); + common_perf_print(ctx, smpl); + + llama_backend_free(); + + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); + + return 0; +} diff --git a/llama.cpp/tools/cvector-generator/CMakeLists.txt b/llama.cpp/tools/cvector-generator/CMakeLists.txt new file mode 100644 index 0000000..baeb4d0 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-cvector-generator) +add_executable(${TARGET} cvector-generator.cpp pca.hpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/cvector-generator/README.md b/llama.cpp/tools/cvector-generator/README.md new file mode 100644 index 0000000..6d5fd74 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/README.md @@ -0,0 +1,45 @@ +# cvector-generator + +This example demonstrates how to generate a control vector using gguf models. + +Related PRs: +- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970) +- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880) +- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514) + +## Examples + +```sh +# CPU only +./cvector-generator -m ./llama-3.Q4_K_M.gguf + +# With GPU +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 + +# With advanced options +./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100 + +# Using mean value instead of PCA +./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean + +# To see help message +./cvector-generator -h +# Then, have a look at "cvector" section +``` + +## Tips and tricks + +If you have multiple lines per prompt, you can escape the newline character (change it to `\n`). For example: + +``` +<|im_start|>system\nAct like a person who is extremely happy.<|im_end|> +<|im_start|>system\nYou are in a very good mood today<|im_end|> +``` + +Example to use output file with `llama-cli`: + +(Tips: The control vector works better when apply to layers higher than 10) + +```sh +./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31 +``` diff --git a/llama.cpp/tools/cvector-generator/completions.txt b/llama.cpp/tools/cvector-generator/completions.txt new file mode 100644 index 0000000..abc45ff --- /dev/null +++ b/llama.cpp/tools/cvector-generator/completions.txt @@ -0,0 +1,582 @@ + +That game +I can see +Hmm, this +I can relate to +Who is +I understand the +Ugh, +What the hell was +Hey, did anyone +Although +Thank you for choosing +What are you +Oh w +How dare you open +It was my pleasure +I'm hon +I appreciate that you +Are you k +Whoever left this +It's always +Ew, +Hey, I l +Hello? 
Is someone +I understand that +That poem +Aww, poor +Hey, it +Alright, who +I didn't +Well, life +The document +Oh no, this +I'm concerned +Hello, this is +This art +Hmm, this drink +Hi there! +It seems +Is +Good +I can't +Ex +Who are +I can see that +Wow, +Today is a +Hey friend +Sometimes friends +Oh, this old +The weather outside +This place is sur +I appreciate your input +Thank you for the +Look at +I'm disappoint +To my +How dare you +That's an +This piece of art +Eww +This park is +This is incredible +Oh no, someone +Exc +Well, it' +I warned +Hey, I understand +Hey, I saw +How dare you go +What the he +Hey +It's +Hello? Hello? +It +Oh no! +This is the perfect +Good morning, +Oh no, there +It's so +Yeah +Uh, +Hello everyone +Who turned off +The weather +Who' +Hey, this +Wait, +Eww, gross +Excuse +It seems like you +Thank you so +What happened? +Oh my g +I am deeply sad +I war +Okay, let' +Hey, that +That was a beautiful +Oh no! That +What happened +Hey there +The artist' +What?! +Hey, it' +I am disappoint +It seems like +Oh no! The +This park is a +If you +Yes! I did +It sounds +What +Who is it +Hmm, that +That's strange +Yeah, that was +That's interesting +This park +What the hell +Who is that +I feel like my +Oh well +What the hell is +Hello? Hello +To my dearest +Bless you!\" +Thank you for +Oh, looks like +Can you please +This place is +Eww, what +Bless you +Is everything +Hey, I just +Whoever left these +Well, that' +I feel +Hey, do you +It's sad +Oh no, it +Hey, that' +Oh my god, +Thank you, +Hello little one, +I apolog +Hey team, I +How dare you read +Who is this and +Whoever left +Hi there! W +A +If you have +I was +U +Bless +Well, this +Oh, I' +It's a +Eww, +Is everything okay? +Oh, I +Hello, can you +Al +That was a great +What are +I understand that not +Oh no, not +Who is it?\" +Hey, can we +Whoever is taking +I would love to +Hey, I noticed +Hey, could +I understand that there +Hello? +D +Oh man, I +Thank you so much +Oh no, my +Dear [Name +Uh +I remember +Hey, who +Well, it +Are you +I understand that it +Hey, is +I would +Who is this +Excuse me +Alright +I am thrilled +Sometimes friends have +Who the +It's interesting +I would love +E +Hello? Is anyone +Well, this is +This place +Well, +I warned you +Hey, watch where +Oh my +That' +Sometimes friends have different +I understand that everyone +What? +What do these notes +I can relate +I'm not +I understand +To my dear +Guys +Well +Hey, I appreciate +Wow, what +Dear +That melody +Who the hell +Today is +Hello little +Wow, look +That's great +Love is never wrong +I'm having +Whoa, did +Ugh +Can you please provide +I miss you, +I feel uncom +I know +Ugh, this +Hey, watch +Oh great, a +I didn +Okay +That game of char +Oh +I appreciate +Who's there +I am so +Oh great, someone +Hey, could you +I remember wondering +Wait, what? +What do +Hello? Can +Hey there, +That game of +This is incred +Oh my gosh +Oh great, f +I appreciate your +It sounds like +What the heck +Okay, I understand +Ew +I understand that this +Uh, hi +Hi everyone! +What the hell? +Thank you for your +Oh no, the +Wow, I +Who turned +Dear [ +Whoever +This is a +Whoa, he +What in the world +Although the physical +Hello, who is +That's amaz +Hey, I know +Okay, that +Hi everyone +Hey, is everything +I understand your fr +Oh no, poor +Oh, look +Good morning +Ew, gross +Oh no, did +Look at the family +Hey team +Yes! +Hey, can I +Okay, that' +It's great +Love is +Hey, what +Good morning, world +Who is it? 
+That poem really reson +I +That's +I understand the task +Gu +Hello? Who' +This postcard is +Whoa, +Oh, that +I understand that I +Whoever is +Hello? Who is +I'm really +Wow, this +Can +This artwork really +This is a shame +I miss you too +Who are you? +Today is a difficult +Hey, just +Are you okay +I am +Hi, +Wow, that +Hey there! Can +Okay, stay +Oh great, just +Yeah, +Hello? Can you +Oh, looks +Thank you for sharing +I'm glad +Hey, is that +Hmm +It was my +It sounds like you +Wow, your +I was promised certain +That was such a +Thank +Excuse you +That was +Hey team, +I feel un +It was +What' +Hey friend, I +How +Saying goodbye +That +It's heart +How dare +Oh, +Hello, may +What's this +Thank you for recogn +Aww, that +Oh, I remember +Hmm, that' +I miss +I know this +Wait +Is everything okay +Who is that person +Wow, you +Oh great +I'm sad +Wow, the +I am very disappoint +Who turned off the +I understand that things +I'm very +Hi +That's very +Okay, I +Oh no, +Wow, there +What's wrong +I apologize for +Hey, I +Can I help you +Oh, I didn +Alright, +Oh wow, +Oh my goodness +I know this event +What in the +Saying +Yeah, that +Guys, I +Hey, this v +This post +Are +Hey, can +Hello? Is +I can only imagine +Oh, that sounds +Hey, is anyone +I am disappointed +Hello, +Hey everyone, I +That was such +It's okay +The artist +Whoa +I understand that mistakes +Can I help +Who +Hi everyone! I +Hey, can you +Wow, how +Today +Oh no, I +Oh well, I +Well, that +This is the +Yes! I finally +Hey there little +Hello everyone! +Love is never +Look at the +This postcard +Oh great, +Can I +Hmm, this is +I understand your +Oh, look at +B +I'm so +Whoa, this +W +Oh, this +Sometimes +This piece of +What the +That was a +Hey, do +Oh no +Whoa, what +I feel like I +The documentary +Hello +Hello little one +I understand that my +Eww, that +Wow, an +Yes! Finally, +Although the physical location +Whoever is watching +That movie +I remember wondering about +Hey there, little +Who's +Hello, who +Hello everyone! Thank +Hello, can +That's too +Hey, just wanted +Hey there, I +Saying good +Hey there! +Who is there? +Oh my good +I am very +Oh no, what +Wow, thank +I was promised +Hi, is +Hey, I' +Guys, the +Oh no, that +Who is there +Hello, this +That movie really touched +If you have something +The documentary was +I'm starting +Are you kidd +That movie really +Hey everyone, +Thank you for considering +I didn' +Yes! I +Can you +Oh my god +Hey, whoever +That melody really +Thank you, little +Hello, may I +Look +Wow, we +It looks +What do these +Oh wow +I apologize +What are you all +It's such +It's clear +Hey, I was +Hey friend, +I can only +The weather outside is +Eww, this +I miss you +Wow +Aww, +Hi, is there +This artwork +Okay, +Oh well, +This +I' +Say +Hey there little gu +Hmm, +Whoa, who +I am thr +Oh man +Okay, stay calm +I'm happy +Oh, this cur +Oh man, +I'm sorry +Hello? Who +What?! That +This piece +Hey everyone +That's so +Are you okay? +What happened? Where +Hi there +The +Who the hell entered +I can +Guys, +What's +What in +It's important +I'm +I'm coming +It' +Yes! 
Finally +Wait, what +Wow, reading +I'm surprised +Hey, did +Hey, +Okay, let +I understand that you +Who the hell threw +Eww, who +Thank you for thinking +Who is this?\" +I am deeply +Thank you for including +Oh no, an +It looks like you +Aww +I'm confused +Wow, it +That poem really +Yes +Hey there, is +Hey, what' +Thank you for remember +To +This is +Thank you for making +I can' +That mel +Wow, they +I feel like +Although the +Who are you +Love +If +What the hell are +I am so sad +Oh, I found +Thank you +It looks like +Well, life is +I appreciate that +The artist's +Whoa, that +It's never \ No newline at end of file diff --git a/llama.cpp/tools/cvector-generator/cvector-generator.cpp b/llama.cpp/tools/cvector-generator/cvector-generator.cpp new file mode 100644 index 0000000..3ba7c52 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/cvector-generator.cpp @@ -0,0 +1,508 @@ +#include "ggml.h" +#include "gguf.h" + +#include "arg.h" +#include "common.h" +#include "llama.h" +#include "pca.hpp" +#include "mean.hpp" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +////////////////////////////////////////////////// +// utils + +template +static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { + std::string ret; + for (; begin != end; ++begin) { + ret += common_token_to_piece(ctx, *begin); + } + + return ret; +} + +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); + printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); + printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]); + printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]); + printf("\n"); +} + +////////////////////////////////////////////////// + + +// cb_eval is reused for each pair of positive - negative prompt +struct callback_data { + ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered + + int n_layers = 0; + int n_tokens = 0; + bool is_eval_pos = true; + + // each element of the vector correspond to one layer + std::vector v_pos; // vector of matrices of size [n_embd, n_tokens] + std::vector v_neg; // vector of matrices of size [n_embd, n_tokens] + std::vector v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. 
NOTE: n_nonzero_rows maybe different for each layer + + // save a tensor into either v_pos or v_neg (decided by is_eval_pos) + void save_tensor_for_layer(struct ggml_tensor * t) { + GGML_ASSERT(t->type == GGML_TYPE_F32); + + if (ctx_ggml == nullptr) { + // alloc a new ctx_ggml if needed + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + } + + // copy tensor data + auto n_bytes = ggml_nbytes(t); + struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]); + t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow + ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); + ggml_set_name(t_layer, ggml_get_name(t)); + //print_debug_tensor(t_layer); + + if (is_eval_pos) { + v_pos.push_back(t_layer); + } else { + v_neg.push_back(t_layer); + } + } + + // calculate diff (v_pos - v_neg) and place the result back to v_pos + // all zero rows in the diff tensor will also be removed + // NOTE: final layer is ignored. we only have (n_layers - 1) to process + std::vector calc_diff() { + for (float il = 0; il < v_pos.size(); il++) { + float * a = (float *) v_pos[il]->data; + float * b = (float *) v_neg[il]->data; + size_t n_elem = ggml_nelements(v_pos[il]); + for (size_t j = 0; j < n_elem; j++) { + a[j] -= b[j]; + } + //print_debug_tensor(v_pos[i]); + auto diff_filtered = filter_nonzero_rows(v_pos[il]); + v_diff_filtered.push_back(diff_filtered); + } + return v_diff_filtered; // for convinient, we return the result std::vector + } + + // delete zero rows from a given 2D tensor + struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) { + //printf("filter_nonzero_rows\n"); + auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool { + // check if given row containing all zero elements + int n_cols = t->ne[0]; // hint: should be equal to n_embd + for (int col = 0; col < n_cols; ++col) { + if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) { + return false; + } + } + return true; + }; + std::vector rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered) + for (int i_row = 0; i_row < a->ne[1]; i_row++) { + if (!is_row_all_zeros(a, i_row, 1e-6)) { + rows_to_copy.push_back(i_row); + } + } + + // get "n_nonzero_rows" for the output "diff_filtered" + int n_nonzero_rows = rows_to_copy.size(); + //printf("n_nonzero_rows: %d\n", n_nonzero_rows); + int n_embd = a->ne[0]; + GGML_ASSERT(n_nonzero_rows > 0); + + // diff_filtered: [n_embd, n_nonzero_rows] + struct ggml_tensor * diff_filtered = ggml_new_tensor_2d( + ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows); + ggml_format_name(diff_filtered, "diff_filtered_%s", a->name); + diff_filtered->data = malloc(ggml_nbytes(diff_filtered)); + + // copy non-zero rows + for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) { + int src_row = rows_to_copy[dest_row]; + for (int i = 0; i < n_embd; i++) { + float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0); + ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem); + } + } + + //print_debug_tensor(diff_filtered); + + return diff_filtered; + } + + // we don't implement destructor, because we want to reuse callback_data. 
we just want to free the tensors + void reset() { + for (auto ptr : v_pos) free(ptr->data); + for (auto ptr : v_neg) free(ptr->data); + for (auto ptr : v_diff_filtered) free(ptr->data); + v_pos.clear(); + v_neg.clear(); + v_diff_filtered.clear(); + if (ctx_ggml) { + ggml_free(ctx_ggml); + } + ctx_ggml = nullptr; + } +}; + +/** + * process_ctx is used to store the ggml context for pre-post processing the diff vectors + * in short, input => v_diff and output => v_final + */ +struct train_context { + ggml_context * ctx_ggml; + int n_embd; + int n_layers; + + /* pair of prompts to be used for generating final vector */ + std::vector positive_entries; + std::vector negative_entries; + + // each element of the vector correspond to one layer + // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here + // NOTE (2): v_diff is transposed from v_diff_tmp + std::vector v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) + std::vector v_final; // vector of vectors of size [n_embd] to be written to file + + // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor + // v_diff_tmp will get converted unto v_diff later on + std::vector> v_diff_tmp; + + train_context(int n_embd_, int n_layers_) { + n_embd = n_embd_; + n_layers = n_layers_; + struct ggml_init_params params_ggml = { + /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_ggml = ggml_init(params_ggml); + for (int il = 0; il < n_layers - 1; il++) { + std::vector empty; + v_diff_tmp.push_back(empty); + auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd); + t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible + v_final.push_back(t); + } + } + + // add new rows into existing tensor in v_diff_tmp + void concat_diff_tmp(const std::vector & diff_filtered) { + GGML_ASSERT((int) diff_filtered.size() == n_layers - 1); + for (int il = 0; il < n_layers - 1; il++) { + auto t = diff_filtered[il]; + auto & diff_tmp = v_diff_tmp[il]; + size_t curr_size = diff_tmp.size(); + diff_tmp.resize(curr_size + ggml_nbytes(t)); + memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t)); + } + } + + // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) + // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method + void build_v_diff(bool transpose) { + printf("build_v_diff\n"); + for (int il = 0; il < n_layers - 1; il++) { + auto & diff_tmp = v_diff_tmp[il]; + int n_elem = diff_tmp.size() / sizeof(float); + GGML_ASSERT(n_elem % n_embd == 0); + int n_rows = n_elem / n_embd; + struct ggml_tensor * diff = transpose + ? 
ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd) + : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); + ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); + diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible + if (transpose) { + // copy data & transpose + float * arr = (float *) diff_tmp.data(); + for (int ir = 0; ir < n_rows; ++ir) { + for (int ic = 0; ic < n_embd; ++ic) { + float f = arr[ir*n_embd + ic]; + ggml_set_f32_nd(diff, ir, ic, 0, 0, f); + } + } + } else { + // only copy + memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff)); + } + v_diff.push_back(diff); + print_debug_tensor(diff); + // free memory of diff_tmp + diff_tmp.resize(0); + } + } + + ~train_context() { + for (auto ptr : v_final) free(ptr->data); + for (auto ptr : v_diff) free(ptr->data); + // no need to free v_diff_tmp, since we didn't use malloc + ggml_free(ctx_ggml); + } +}; + +struct tokenized_prompt { + std::vector tokens_pos; + std::vector tokens_neg; + size_t max_seq_len; + + tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const bool add_bos = llama_vocab_get_add_bos(vocab); + tokens_pos = common_tokenize(ctx, pos, add_bos, true); + tokens_neg = common_tokenize(ctx, neg, add_bos, true); + max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); + padding_seq(ctx, tokens_pos, max_seq_len); + padding_seq(ctx, tokens_neg, max_seq_len); + } + + void padding_seq(llama_context * ctx, std::vector & tokens, size_t len) { + // TODO: customize padding token + std::vector pad_tokens = common_tokenize(ctx, " ", false); + llama_token pad_tok = pad_tokens.back(); + while (tokens.size() < len) { + tokens.push_back(pad_tok); + } + } +}; + +////////////////////////////////////////////////// + +template +static std::string to_string(const T & val) { + std::stringstream ss; + ss << val; + return ss.str(); +} + +static std::vector ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) { + std::vector output; + std::ifstream file(path); + if (!file.is_open()) { + fprintf(stderr, "error: unable to open file: %s\n", path.c_str()); + exit(1); + } + std::string line; + while (std::getline(file, line)) { + bool is_skip = skip_empty_lines && line.empty(); + if (!is_skip) { + string_process_escapes(line); + output.push_back(line); + } + } + file.close(); + return output; +} + +////////////////////////////////////////////////// + +static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (callback_data *) user_data; + static const char * l_out_name = "l_out"; + const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; + + if (ask) { + return is_l_out; + } + + if (!is_l_out || t->ne[1] != cb_data->n_tokens) { + return true; + } + + // save the tensor to current context + cb_data->save_tensor_for_layer(t); + return true; +} + +static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { + llama_memory_clear(llama_get_memory(ctx), true); + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + return true; +} + +static void export_gguf(const std::vector & v_ctrl, const std::string fname, const std::string model_hint) { + struct gguf_context * ctx = gguf_init_empty(); + + const std::string arch = "controlvector"; + gguf_set_val_str(ctx, 
"general.architecture", arch.c_str()); + gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); + gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size()); + + for (size_t i = 0; i < v_ctrl.size(); ++i) { + gguf_add_tensor(ctx, v_ctrl[i]); + print_debug_tensor(v_ctrl[i]); + printf("Added tensor: %s\n", v_ctrl[i]->name); + } + + printf("%s: writing file...\n", __func__); + gguf_write_to_file(ctx, fname.c_str(), false); + printf("%s: wrote file '%s'\n", __func__, fname.c_str()); + gguf_free(ctx); +} + +/** + * Load prompt files and completion file. + * Then format each pair of prompt + completion to make an entry. + */ +static int prepare_entries(common_params & params, train_context & ctx_train) { + // load prompts + std::vector positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); + std::vector negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); + if (positive_prompts.size() != negative_prompts.size()) { + fprintf(stderr, "number of positive and negative prompts must be equal\n"); + return 1; + } + if (positive_prompts.empty()) { + fprintf(stderr, "must provide at least one prompt pair\n"); + return 1; + } + ctx_train.positive_entries = positive_prompts; + ctx_train.negative_entries = negative_prompts; + return 0; +} + +int main(int argc, char ** argv) { + common_params params; + + params.out_file = "control_vector.gguf"; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { + return 1; + } + + if (params.n_pca_iterations % params.n_pca_batch != 0) { + fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n"); + return 1; + } + + + callback_data cb_data; + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = cb_eval; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + print_build_info(); + llama_backend_init(); + llama_numa_init(params.numa); + + // load the model to get hparams + auto llama_init = common_init_from_params(params); + + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + // int n_ctx = llama_n_ctx(ctx); + int n_layers = llama_model_n_layer(model); + int n_embd = llama_model_n_embd(model); + + // get model hint param (a.k.a model arch name) + char model_hint[128]; + llama_model_meta_val_str(model, "general.architecture", model_hint, 128); + + // init train_context + train_context ctx_train(n_embd, n_layers); + + // load and prepare entries for training + prepare_entries(params, ctx_train); + + // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped + std::vector tokenized_prompts; + size_t n_total_tokens = 0; + for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]); + n_total_tokens += 2 * t.max_seq_len; + tokenized_prompts.push_back(std::move(t)); + } + + std::cout << "n_total_tokens: " << n_total_tokens << std::endl; + + for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { + bool success = false; + tokenized_prompt t = tokenized_prompts[i]; + cb_data.n_layers = n_layers; + cb_data.n_tokens = t.max_seq_len; + + printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", + (int) i+1, (int) ctx_train.positive_entries.size(), + tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), + tokens_to_str(ctx, 
t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), + (int) t.max_seq_len); + + cb_data.is_eval_pos = true; + success = get_hidden_layers(ctx, t.tokens_pos); + if (!success) break; + + cb_data.is_eval_pos = false; + success = get_hidden_layers(ctx, t.tokens_neg); + if (!success) break; + + // calculate diff and remove all zero rows + auto v_diff_filtered = cb_data.calc_diff(); + + // save & concat the filtered v_diff to ctx_train + ctx_train.concat_diff_tmp(v_diff_filtered); + + // reset for next iteration + cb_data.reset(); + } + + // done with the model, we can now free it to make gain some memory + printf("Done evaluate prompts, unload model...\n"); + + bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; + + // prepare ctx_train for PCA + ctx_train.build_v_diff(use_pca); + + if (use_pca) { + // run PCA + PCA::pca_params pca_params; + pca_params.n_threads = params.cpuparams.n_threads; + pca_params.n_batch = params.n_pca_batch; + pca_params.n_iterations = params.n_pca_iterations; + PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); + } else { + // run mean + mean::run(ctx_train.v_diff, ctx_train.v_final); + } + + // write output vectors to gguf + export_gguf(ctx_train.v_final, params.out_file, model_hint); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/cvector-generator/mean.hpp b/llama.cpp/tools/cvector-generator/mean.hpp new file mode 100644 index 0000000..4eeac1e --- /dev/null +++ b/llama.cpp/tools/cvector-generator/mean.hpp @@ -0,0 +1,48 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include + +namespace mean { + +static void run( + const std::vector & v_input, // shape of v_input[0]: [n_embd, n_samples] + const std::vector & v_output) { + printf("%s: Running mean...\n", __func__); + for (size_t il = 0; il < v_input.size(); ++il) { + // prepare output vector + struct ggml_tensor * ctrl_out = v_output[il]; + ggml_format_name(ctrl_out, "direction.%zu", il+1); + + // calculate mean vector + struct ggml_tensor * t_layer = v_input[il]; + GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd + for (int ic = 0; ic < t_layer->ne[0]; ic++) { + float f = 0.0; + for (int ir = 0; ir < t_layer->ne[1]; ir++) { + f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0); + } + f /= t_layer->ne[1]; + ggml_set_f32_1d(ctrl_out, ic, f); + } + + // normalize output vector + float norm = 0.0; + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + norm += f*f; + } + norm = sqrt(norm); + for (int i = 0; i < ggml_nelements(ctrl_out); i++) { + float f = ggml_get_f32_1d(ctrl_out, i); + ggml_set_f32_1d(ctrl_out, i, f / norm); + } + + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); + } +} + +} diff --git a/llama.cpp/tools/cvector-generator/negative.txt b/llama.cpp/tools/cvector-generator/negative.txt new file mode 100644 index 0000000..45b9384 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/negative.txt @@ -0,0 +1,4 @@ +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow 
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me +<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow \ No newline at end of file diff --git a/llama.cpp/tools/cvector-generator/pca.hpp b/llama.cpp/tools/cvector-generator/pca.hpp new file mode 100644 index 0000000..afd3bf6 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/pca.hpp @@ -0,0 +1,315 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#ifdef GGML_USE_CUDA +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include +#include +#include +#include +#include + +#define DEBUG_POS 5 + +static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) { + printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]); + if (!with_data) return; + printf("%s: %s[0] = [", __func__, t->name); + for (size_t i = 0; i <= DEBUG_POS; i++) { + printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0)); + } + printf(" ... ]\n"); +} + +namespace PCA { + +// input params for PCA computations +struct pca_params { + int n_threads = 1; + int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used + int n_iterations = 1000; + float tolerance = 1e-7; + + // for debugging + int i_layer = 0; + int n_layers = 0; +}; + +// result from each iteration +struct pca_result { + struct ggml_tensor * calculated_square = NULL; + std::vector eigenvectors; + std::vector distances; +}; + +struct pca_model { + ggml_backend_t backend = NULL; + ggml_backend_buffer_t buffer; + struct ggml_context * ctx; // context to compute graph on target device + struct ggml_context * ctx_host; // host context to store results + + // tensors on target device + struct ggml_tensor * dev_input; + struct ggml_tensor * dev_square; + struct ggml_tensor * dev_eigenvector; + + pca_model(struct ggml_tensor * t_input) { +#ifdef GGML_USE_CUDA + fprintf(stderr, "%s: using CUDA backend\n", __func__); + backend = ggml_backend_cuda_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } +#endif + +// TODO: enable Metal support when support for GGML_OP_SQRT is added +// #ifdef GGML_USE_METAL +// fprintf(stderr, "%s: using Metal backend\n", __func__); +// backend = ggml_backend_metal_init(); +// if (!backend) { +// fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); +// } +// #endif + + // if there aren't GPU Backends fallback to CPU backend + if (!backend) { + backend = ggml_backend_cpu_init(); + } + + const int num_tensors = 4; + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead() * num_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx = ggml_init(params); + + auto n_samples = t_input->ne[0]; + auto n_embd = t_input->ne[1]; + + dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd); + dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + ggml_set_name(dev_input, "dev_input"); + ggml_set_name(dev_square, "dev_square"); + ggml_set_name(dev_eigenvector, 
"dev_eigenvector"); + buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input)); + + // initialize eigenvector to random normalized vector + { + std::vector random_vec(ggml_nelements(dev_eigenvector), 0.0); + std::default_random_engine generator(static_cast(std::time(0))); + std::uniform_real_distribution distribution(0.0, 1.0); + float sum_sqr = 0.0; // for normalizing random_vec + for (size_t i = 0; i < random_vec.size(); ++i) { + float f = distribution(generator); + sum_sqr += f * f; + random_vec[i] = f; + } + // normalize it + float random_vec_norm = std::sqrt(sum_sqr); + for (size_t i = 0; i < random_vec.size(); ++i) { + random_vec[i] /= random_vec_norm; + } + ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector)); + } + } + + ~pca_model() { + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + } +}; + +static struct ggml_cgraph * build_graph_piter( + const struct pca_params & params, + const pca_model & model, + bool calc_square = false) { + GGML_ASSERT(params.n_batch > 0); + // TODO: buf_size must be able to scale with params.n_batch + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + }; + // create a temporally context to build the graph + struct ggml_context * ctx0 = ggml_init(params0); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // turn v_diff_original into square matrix if needed + struct ggml_tensor * tmp_square; + if (calc_square) { + tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input); + ggml_set_name(tmp_square, "tmp_square"); + } + + struct ggml_tensor * b_tensor; + struct ggml_tensor * distance; + struct ggml_tensor * old_eigen = model.dev_eigenvector; + struct ggml_tensor * input_square = calc_square ? 
tmp_square : model.dev_square; + + for (int i = 0; i < params.n_batch; ++i) { + // b_tensor = square * eigenvector^T + b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen); + ggml_set_name(b_tensor, "b_tensor"); + + // normalize + b_tensor = ggml_div_inplace(ctx0, + b_tensor, + ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor))) + ); + ggml_format_name(b_tensor, "b_tensor_norm_%d", i); + + // calculate distance(new eigenvector - old eigenvector) + // we don't use ggml_sub because it may not be implemented on GPU backend + struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1)); + distance = ggml_sqrt_inplace(ctx0, + ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old))); + ggml_format_name(distance, "distance_%d", i); + + old_eigen = b_tensor; + + // build operations nodes + ggml_build_forward_expand(gf, distance); + } + + // delete the temporally context used to build the graph + ggml_free(ctx0); + return gf; +} + +static ggml_status compute_piter( + const struct pca_params & params, + const pca_model & model, + struct ggml_cgraph * gf, + ggml_gallocr_t allocr, + struct pca_result & result) { + // allocate tensors + ggml_gallocr_alloc_graph(allocr, gf); + + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, params.n_threads); + } + + ggml_status res = ggml_backend_graph_compute(model.backend, gf); + if (res == GGML_STATUS_SUCCESS) { + auto extract_i = [](std::string prefix, std::string str) -> int { + int i = -1; + if (str.rfind(prefix, 0) == 0) { + sscanf(str.c_str(), (prefix + "%d").c_str(), &i); + } + return i; + }; + result.calculated_square = NULL; + result.eigenvectors.clear(); + result.distances.clear(); + result.eigenvectors.resize(params.n_batch); + result.distances.resize(params.n_batch); + // get output nodes + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + auto node = ggml_graph_node(gf, i); + int iter = -1; + // find b_tensor (without copying data from device) + if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { + result.eigenvectors[iter] = node; + } + // find distances, then copy data from device + if ((iter = extract_i("distance_", node->name)) > -1) { + float d; + ggml_backend_tensor_get(node, &d, 0, sizeof(float)); + result.distances[iter] = d; + // std::cout << node->name << " = " << d << "\n"; + } + // find tmp_square if it exists (without copying data from device) + if (std::string(node->name) == "tmp_square") { + result.calculated_square = node; + } + } + } + return res; +} + +static void power_iteration( + const struct pca_params & params, + struct ggml_tensor * input, // shape of input: [n_samples, n_embd] + struct ggml_tensor * output) { + //printf("in power iteration\n"); + struct pca_model model(input); + + ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); + struct pca_result result; + struct ggml_tensor * last_eigenvector = NULL; + + int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations + for (int iter = 0; iter < n_iters; ++iter) { + bool calc_square = (iter == 0); // only need to calculate square for first iteration + struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square); + // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot"); + compute_piter(params, model, gf, allocr, result); + + for (size_t k = 0; k < result.distances.size(); ++k) { + last_eigenvector = result.eigenvectors[k]; + if (result.distances[k] < params.tolerance) { + break; // done + } + } 
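+            // note: the graph above performed params.n_batch matrix-vector products in one go;
+            // result.distances[k] is the L2 distance between consecutive eigenvector estimates,
+            // so the scan keeps the first estimate whose change already dropped below the
+            // tolerance (or the last estimate of the batch if none has converged yet)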
+ + if (calc_square) { + // copy and store the square matrix if needed + GGML_ASSERT(result.calculated_square != NULL); + ggml_backend_tensor_copy(result.calculated_square, model.dev_square); + } + + { + // copy last eigen vector and store as input for next iteration + GGML_ASSERT(last_eigenvector != NULL); + ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector); + } + + printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n", + __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch); + } + + // get output tensor + GGML_ASSERT(last_eigenvector); + ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector)); + //print_debug_tensor(output); + ggml_gallocr_free(allocr); + + // TODO @ngxson : The output vector is randomly inverted + // Solution: https://github.com/ggml-org/llama.cpp/pull/8069#issuecomment-2185328171 +} + +static void run_pca( + struct pca_params & params, + const std::vector & v_input, // shape of v_input[0]: [n_samples, n_embd] + const std::vector & v_output) { + printf("%s: Running PCA...\n", __func__); + for (size_t il = 0; il < v_input.size(); ++il) { + + // prepare output vector + struct ggml_tensor * ctrl_out = v_output[il]; + ggml_format_name(ctrl_out, "direction.%zu", il+1); + + // run power_iteration + params.i_layer = il; + params.n_layers = v_input.size(); + power_iteration(params, v_input[il], ctrl_out); + printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size()); + } +} + +} diff --git a/llama.cpp/tools/cvector-generator/positive.txt b/llama.cpp/tools/cvector-generator/positive.txt new file mode 100644 index 0000000..fea7362 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/positive.txt @@ -0,0 +1,4 @@ +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world +<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever! +<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you +<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now! \ No newline at end of file diff --git a/llama.cpp/tools/export-lora/CMakeLists.txt b/llama.cpp/tools/export-lora/CMakeLists.txt new file mode 100644 index 0000000..cddfa77 --- /dev/null +++ b/llama.cpp/tools/export-lora/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-export-lora) +add_executable(${TARGET} export-lora.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/export-lora/README.md b/llama.cpp/tools/export-lora/README.md new file mode 100644 index 0000000..7dce99c --- /dev/null +++ b/llama.cpp/tools/export-lora/README.md @@ -0,0 +1,33 @@ +# export-lora + +Apply LORA adapters to base model and export the resulting model. 
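For each base tensor that has matching `lora_a`/`lora_b` tensors in the adapter(s), the merge performed by `merge_tensor()` in `export-lora.cpp` is, roughly, $W_{merged} = W_{base} + s \cdot (B \cdot A)$ with $s = \text{scale} \cdot \alpha / \text{rank}$, where $\alpha$ is read from `adapter.lora.alpha` (if it is zero, $s$ falls back to the user-supplied scale alone). Tensors without LoRA counterparts are copied unchanged, and merged tensors are written as F16 (F32 tensors stay F32).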
+ +``` +usage: llama-export-lora [options] + +options: + -m, --model model path from which to load base model (default '') + --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) + --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters) + -t, --threads N number of threads to use during computation (default: 4) + -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf') +``` + +For example: + +```bash +./bin/llama-export-lora \ + -m open-llama-3b-v2.gguf \ + -o open-llama-3b-v2-english2tokipona-chat.gguf \ + --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf +``` + +Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: + +```bash +./bin/llama-export-lora \ + -m your_base_model.gguf \ + -o your_merged_model.gguf \ + --lora-scaled lora_task_A.gguf 0.5 \ + --lora-scaled lora_task_B.gguf 0.5 +``` diff --git a/llama.cpp/tools/export-lora/export-lora.cpp b/llama.cpp/tools/export-lora/export-lora.cpp new file mode 100644 index 0000000..41f4262 --- /dev/null +++ b/llama.cpp/tools/export-lora/export-lora.cpp @@ -0,0 +1,434 @@ +#include "ggml.h" +#include "ggml-alloc.h" +#include "gguf.h" + +#include "arg.h" +#include "common.h" + +#include +#include +#include +#include + +static bool g_verbose = false; + +struct tensor_transformation { + struct ggml_tensor * in; + struct ggml_tensor * out; + bool is_copy; +}; + +static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); +} + +static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 
0.0f : gguf_get_val_f32(ctx_gguf, id); +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { + throw std::runtime_error("failed to load input GGUF from " + fname); + } + return ctx_gguf; +} + +struct file_input { + struct ggml_context * ctx_meta = nullptr; + struct gguf_context * ctx_gguf = nullptr; + std::ifstream f_in; + std::map tensors; + float alpha; + float scale; + + file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { + if (!f_in.is_open()) { + throw std::runtime_error("failed to open input gguf from " + fname); + } + + ctx_gguf = load_gguf(fname, &ctx_meta); + alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha"); + printf("%s: loaded gguf from %s\n", __func__, fname.c_str()); + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) { + std::string name(cur->name); + tensors[name] = cur; + if (g_verbose) { + printf("%s: %s\n", __func__, cur->name); + } + } + } + + ggml_tensor * get_tensor(std::string name) { + if (tensors.find(name) == tensors.end()) { + return nullptr; + } + return tensors[name]; + } + + void read_tensor_data(std::string name, std::vector & buf) { + if (tensors.find(name) == tensors.end()) { + throw std::runtime_error("cannot find tensor with name: " + name); + } + auto len = ggml_nbytes(tensors[name]); + if (buf.size() < len) { + buf.resize(len); + } + auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + f_in.seekg(offset); + f_in.read((char* )buf.data(), len); + } + + ~file_input() { + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } +}; + +struct lora_merge_ctx { + // input base model + adapters + file_input base_model; + std::vector> adapters; + + // for computing merged tensor + int n_threads; + ggml_backend_t backend = nullptr; + ggml_gallocr_t allocr = nullptr; + std::vector read_buf; + + // output file + struct gguf_context * ctx_out; + struct ggml_context * ctx_out_ggml; + std::ofstream fout; + + lora_merge_ctx( + std::string & base_fname, + std::vector & lora_files, + std::string & outfile, + int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { + throw std::runtime_error("split model is not yet supported"); + } + + for (auto & lora_inp : lora_files) { + auto fname = lora_inp.path; + auto scale = lora_inp.scale; + std::unique_ptr adapter(new file_input(fname, scale)); + check_metadata_lora(adapter.get()); + adapters.push_back(std::move(adapter)); + } + + ctx_out = gguf_init_empty(); + struct ggml_init_params params = { + /*.mem_size =*/ static_cast(gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead()), + /*.mem_buffer =*/ NULL, + 
/*.no_alloc =*/ true, + }; + ctx_out_ggml = ggml_init(params); + backend = ggml_backend_cpu_init(); + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + + void check_metadata_lora(file_input * adapter) { + auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); + if (general_type != "adapter") { + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type"); + if (adapter_type != "lora") { + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture"); + auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture"); + if (general_arch_base != general_arch_lora) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + } + + ggml_type get_out_tensor_type(struct ggml_tensor * t) { + if (t->type == GGML_TYPE_F32) { + return GGML_TYPE_F32; + } else { + return GGML_TYPE_F16; + } + } + + void run_merge() { + // prepare metadata + gguf_set_kv(ctx_out, base_model.ctx_gguf); + // output is forced to f16 for now + gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); + + // check if all lora adapters have the same tensors + // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggml-org/llama.cpp/pull/8607#discussion_r1686027777 + static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once."; + if (adapters.size() > 1) { + for (size_t i = 1; i < adapters.size(); ++i) { + if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) { + throw std::runtime_error(err_no_subset_adapter); + } + for (auto & it : adapters[i]->tensors) { + if (adapters[0]->get_tensor(it.first) == nullptr) { + throw std::runtime_error(err_no_subset_adapter); + } + } + } + } + + // mapping base tensor to out tensor (same shape with base, but different type) + std::vector trans; + for (auto & it : base_model.tensors) { + bool t_a = true; + bool t_b = true; + for (auto & adapter : adapters) { + t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a"); + t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); + } + auto base_tensor = it.second; + if (!t_a && !t_b) { + // only copy + struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(cpy_tensor, base_tensor->name); + trans.push_back({ + cpy_tensor, + cpy_tensor, + true, + }); + gguf_add_tensor(ctx_out, cpy_tensor); + } else if (t_a && t_b) { + // need merging + struct ggml_tensor * out_tensor = ggml_new_tensor( + ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); + ggml_set_name(out_tensor, base_tensor->name); + trans.push_back({ + base_tensor, + out_tensor, + false, + }); + gguf_add_tensor(ctx_out, out_tensor); + } else { + throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); + } + } + + // placeholder for the meta data + { + size_t meta_size = gguf_get_meta_size(ctx_out); + zeros(fout, meta_size); + } + + // process base model tensors + size_t n_merged = 0; + for (auto & it : trans) { + if (!it.is_copy) { + merge_tensor(it.in, it.out); + n_merged++; + } else { + copy_tensor(it.in); + } + } + + // write output metadata + { + std::vector data(gguf_get_meta_size(ctx_out)); + 
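+            // serialize the final GGUF header into the buffer and overwrite the zero
+            // placeholder that was written at the start of the output file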
gguf_get_meta_data(ctx_out, data.data()); + fout.seekp(0); + fout.write((const char *)data.data(), data.size()); + } + + printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %zu tensors to output file\n", __func__, trans.size()); + } + + void copy_tensor(struct ggml_tensor * base) { + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + size_t len = ggml_nbytes(base); + base_model.read_tensor_data(base->name, read_buf); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + } + + void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { + std::string name_base(base->name); + std::string name_lora_a = name_base + ".lora_a"; + std::string name_lora_b = name_base + ".lora_b"; + + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + + // context for input tensor + std::vector inp_a(adapters.size()); + std::vector inp_b(adapters.size()); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + + // alloc tensors + struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne); + for (size_t i = 0; i < adapters.size(); ++i) { + auto t_a = adapters[i]->get_tensor(name_lora_a); + auto t_b = adapters[i]->get_tensor(name_lora_b); + // TODO: add support for quantized lora + if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) { + throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32"); + } + inp_a[i] = ggml_dup_tensor(ctx, t_a); + inp_b[i] = ggml_dup_tensor(ctx, t_b); + } + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + + // load base tensor to backend buffer + base_model.read_tensor_data(name_base, read_buf); + if (base->type != GGML_TYPE_F32) { + // optionally dequantize it + printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); + auto nels = ggml_nelements(inp_base); + const auto * qtype = ggml_get_type_traits(base->type); + std::vector dequant_buf(nels * sizeof(float)); + qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); + } else { + ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); + } + + // load lora tensors to backend buffer + for (size_t i = 0; i < adapters.size(); ++i) { + adapters[i]->read_tensor_data(name_lora_a, read_buf); + ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); + adapters[i]->read_tensor_data(name_lora_b, read_buf); + ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i])); + } + + // build graph + struct ggml_cgraph * gf; + { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(params0); + gf = ggml_new_graph(ctx0); + struct ggml_tensor * cur = inp_base; + for (size_t i = 0; i < adapters.size(); ++i) { + struct ggml_tensor * delta; + bool is_tok_embd = string_starts_with(name_base, "token_embd"); + if (is_tok_embd) { + printf("%s : detected token embeddings tensor\n", __func__); + delta = 
ggml_mul_mat(ctx0, + ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32), + ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)); + } else { + delta = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))), + ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); + } + // scale + const float alpha = adapters[i]->alpha; + const float rank = (float) inp_b[i]->ne[0]; + const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; + delta = ggml_scale(ctx0, delta, scale); + cur = ggml_add(ctx0, delta, cur); + printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); + printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); + } + cur = ggml_cast(ctx0, cur, out->type); + printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type)); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + } + + // compute + { + ggml_gallocr_alloc_graph(allocr, gf); + ggml_backend_cpu_set_n_threads(backend, n_threads); + ggml_backend_graph_compute(backend, gf); + } + + // write data to output file + { + auto * result = ggml_graph_node(gf, -1); + size_t len = ggml_nbytes(result); + if (read_buf.size() < len) { + read_buf.resize(len); + } + ggml_backend_tensor_get(result, read_buf.data(), 0, len); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + } + + ~lora_merge_ctx() { + ggml_gallocr_free(allocr); + ggml_backend_free(backend); + gguf_free(ctx_out); + ggml_free(ctx_out_ggml); + } +}; + +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); + printf("\nNOTE: output model is F16\n"); + printf("\n"); +} + +int main(int argc, char ** argv) { + common_params params; + + params.out_file = "ggml-lora-merged-f16.gguf"; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { + return 1; + } + + g_verbose = (params.verbosity > 1); + try { + lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads); + ctx.run_merge(); + } catch (const std::exception & err) { + fprintf(stderr, "%s\n", err.what()); + exit(EXIT_FAILURE); + } + + printf("done, output file is %s\n", params.out_file.c_str()); + + return 0; +} diff --git a/llama.cpp/tools/fit-params/CMakeLists.txt b/llama.cpp/tools/fit-params/CMakeLists.txt new file mode 100644 index 0000000..34c3373 --- /dev/null +++ b/llama.cpp/tools/fit-params/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-fit-params) +add_executable(${TARGET} fit-params.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/fit-params/README.md b/llama.cpp/tools/fit-params/README.md new file mode 100644 index 0000000..8f0c958 --- /dev/null +++ b/llama.cpp/tools/fit-params/README.md @@ -0,0 +1,55 @@ +# fit-params + +llama.cpp binaries can automatically fit the projected memory use of a model to the free device memory available at runtime, +this is controlled using the CLI arguments starting with `-fit`/`--fit`. +Internally the code is calling `llama_params_fit` to adjust the `llama_model_params` and `llama_context_params` structs. 
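For programs that link against llama.cpp directly, the same adjustment can be requested before creating the model and context. Below is a minimal sketch that mirrors the `llama_params_fit()` call made by `fit-params.cpp` further down; the model path is an example, and passing null/zero for the optional tensor-split, buffer-override, target and minimum-context arguments is an assumption here, not a documented calling convention.

```cpp
// Sketch only: fit mparams/cparams to the currently free device memory.
// Argument order follows the call in fit-params.cpp; the null/zero values are assumptions.
llama_model_params   mparams = llama_model_default_params();
llama_context_params cparams = llama_context_default_params();

const llama_params_fit_status status = llama_params_fit(
    "/opt/models/qwen_3-30b3a-f16.gguf",   // model path (example)
    &mparams, &cparams,
    /*tensor_split          =*/ nullptr,
    /*tensor_buft_overrides =*/ nullptr,
    /*target                =*/ nullptr,
    /*min_ctx               =*/ 0,
    GGML_LOG_LEVEL_ERROR);

if (status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
    // mparams.n_gpu_layers, mparams.tensor_split and cparams.n_ctx now hold the fitted values
}
```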
+`llama-fit-params` is a simple utility that prints the CLI arguments corresponding to these adjustments to stdout. +Example usage: + +``` bash +# First, run llama-fit-params and store the results in a file: +> ./build/bin/llama-fit-params --model /opt/models/qwen_3-30b3a-f16.gguf | tee args.txt +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes +build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu +llama_params_fit_impl: projected to use 61807 MiB of device memory vs. 24077 MiB of free device memory +llama_params_fit_impl: cannot fulfill margin of 1024 MiB, need to reduce device memory by 42444 MiB +llama_params_fit_impl: context size reduced from 40960 to 4096 -> need 3456 MiB less memory in total +llama_params_fit_impl: with only dense weights in device memory there is a total surplus of 16164 MiB +llama_params_fit_impl: distributing layers across devices with overflow to next device/system memory: +llama_params_fit_impl: - CUDA0 (NVIDIA GeForce RTX 4090): 48 layers (34 overflowing), 19187 MiB used, 1199 MiB free +llama_params_fit: successfully fit params to free device memory +llama_params_fit: fitting params to free memory took 1.15 seconds +Printing fitted CLI arguments to stdout... +-c 4096 -ngl 48 -ot blk\.14\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.15\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.16\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.17\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.18\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.19\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.20\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.21\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.22\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.23\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.24\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.25\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.26\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.27\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.28\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.29\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.30\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.31\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.32\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.33\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.34\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.35\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.36\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.37\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.38\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.39\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.40\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.41\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.42\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.43\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.44\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.45\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.46\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.47\.ffn_(up|down|gate)_(ch|)exps=CPU + +# Next, use those results for a llama.cpp binary: +> cat args.txt | xargs ./build/bin/llama-server --model /opt/models/qwen_3-30b3a-f16.gguf +ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no +ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no +ggml_cuda_init: found 1 CUDA devices: + Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes +build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu +system info: n_threads = 16, n_threads_batch = 16, total_threads = 32 + +system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | 
AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 | + +main: binding port with default address family +main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31 +main: loading model +srv load_model: loading model '/opt/models/qwen_3-30b3a-f16.gguf' +llama_params_fit_impl: projected to use 19187 MiB of device memory vs. 24077 MiB of free device memory +llama_params_fit_impl: will leave 1199 >= 1024 MiB of free device memory, no changes needed +llama_params_fit: successfully fit params to free device memory +llama_params_fit: fitting params to free memory took 0.28 seconds +[...] +main: server is listening on http://127.0.0.1:8080 - starting the main loop +srv update_slots: all slots are idle +^Csrv operator(): operator(): cleaning up before exit... + +llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | +llama_memory_breakdown_print: | - CUDA0 (RTX 4090) | 24077 = 945 + (19187 = 17904 + 384 + 898) + 3945 | +llama_memory_breakdown_print: | - Host | 58271 = 58259 + 0 + 12 | +``` diff --git a/llama.cpp/tools/fit-params/fit-params.cpp b/llama.cpp/tools/fit-params/fit-params.cpp new file mode 100644 index 0000000..0176be0 --- /dev/null +++ b/llama.cpp/tools/fit-params/fit-params.cpp @@ -0,0 +1,66 @@ +#include "llama.h" + +#include "arg.h" +#include "common.h" +#include "log.h" + +#include +#include +#include + +using namespace std::chrono_literals; + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +int main(int argc, char ** argv) { + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { + return 1; + } + + common_init(); + llama_backend_init(); + llama_numa_init(params.numa); + auto mparams = common_model_params_to_llama(params); + auto cparams = common_context_params_to_llama(params); + const llama_params_fit_status status = llama_params_fit(params.model.path.c_str(), &mparams, &cparams, + params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, + params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + if (status != LLAMA_PARAMS_FIT_STATUS_SUCCESS) { + LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__); + exit(1); + } + + LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__); + common_log_flush(common_log_main()); + printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers); + + size_t nd = llama_max_devices(); + while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) { + nd--; + } + if (nd > 1) { + for (size_t id = 0; id < nd; id++) { + if (id == 0) { + printf(" -ts "); + } + printf("%s%" PRIu32, id > 0 ? "," : "", uint32_t(mparams.tensor_split[id])); + } + } + + const size_t ntbo = llama_max_tensor_buft_overrides(); + bool any_tbo = false; + for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) { + if (itbo == 0) { + printf(" -ot \""); + } + printf("%s%s=%s", itbo > 0 ? "," : "", mparams.tensor_buft_overrides[itbo].pattern, ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft)); + any_tbo = true; + } + printf("%s\n", any_tbo ? 
"\"" : ""); + + return 0; +} diff --git a/llama.cpp/tools/gguf-split/CMakeLists.txt b/llama.cpp/tools/gguf-split/CMakeLists.txt new file mode 100644 index 0000000..9b21250 --- /dev/null +++ b/llama.cpp/tools/gguf-split/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-gguf-split) +add_executable(${TARGET} gguf-split.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/gguf-split/README.md b/llama.cpp/tools/gguf-split/README.md new file mode 100644 index 0000000..ad1d866 --- /dev/null +++ b/llama.cpp/tools/gguf-split/README.md @@ -0,0 +1,10 @@ +## GGUF split Example + +CLI to split / merge GGUF files. + +**Command line options:** + +- `--split`: split GGUF to multiple GGUF, default operation. +- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`. +- `--split-max-tensors`: maximum tensors in each split: default(128) +- `--merge`: merge multiple GGUF to a single GGUF. diff --git a/llama.cpp/tools/gguf-split/gguf-split.cpp b/llama.cpp/tools/gguf-split/gguf-split.cpp new file mode 100644 index 0000000..30e7715 --- /dev/null +++ b/llama.cpp/tools/gguf-split/gguf-split.cpp @@ -0,0 +1,583 @@ +#include "ggml.h" +#include "gguf.h" +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_WIN32) + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif + +enum split_operation : uint8_t { + OP_NONE, + OP_SPLIT, + OP_MERGE, +}; + +enum split_mode : uint8_t { + MODE_NONE, + MODE_TENSOR, + MODE_SIZE, +}; + +struct split_params { + split_operation operation = OP_NONE; + split_mode mode = MODE_NONE; + size_t n_bytes_split = 0; + int n_split_tensors = 128; + std::string input; + std::string output; + bool no_tensor_first_split = false; + bool dry_run = false; +}; + +static void split_print_usage(const char * executable) { + const split_params default_params; + printf("\n"); + printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); + printf("\n"); + printf("Apply a GGUF operation on IN to OUT."); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); + printf(" --split split GGUF to multiple GGUF (enabled by default)\n"); + printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); + printf(" --split-max-size N(M|G) max size per split\n"); + printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); + printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); + printf("\n"); +} + +// return convert string, for example "128M" or "4G" to number of bytes +static size_t split_str_to_n_bytes(std::string str) { + size_t n_bytes = 0; + int n; + if (str.back() == 'M') { + sscanf(str.c_str(), "%d", &n); + n_bytes = (size_t)n * 1000 * 1000; // megabytes + } else if (str.back() == 'G') { + sscanf(str.c_str(), "%d", &n); + n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes + } else { + throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); + } + if (n <= 0) { + throw std::invalid_argument("error: size must be a positive value"); 
+ } + return n_bytes; +} + +static void split_params_parse_ex(int argc, const char ** argv, split_params & params) { + std::string arg; + const std::string arg_prefix = "--"; + bool invalid_param = false; + + int arg_idx = 1; + for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { + arg = argv[arg_idx]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + bool arg_found = false; + if (arg == "-h" || arg == "--help") { + split_print_usage(argv[0]); + exit(0); + } else if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } else if (arg == "--dry-run") { + arg_found = true; + params.dry_run = true; + } else if (arg == "--no-tensor-first-split") { + arg_found = true; + params.no_tensor_first_split = true; + } else if (arg == "--merge") { + arg_found = true; + if (params.operation != OP_NONE && params.operation != OP_MERGE) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_MERGE; + } else if (arg == "--split") { + arg_found = true; + if (params.operation != OP_NONE && params.operation != OP_SPLIT) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + params.operation = OP_SPLIT; + } else if (arg == "--split-max-tensors") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_TENSOR; + params.n_split_tensors = atoi(argv[arg_idx]); + } else if (arg == "--split-max-size") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + if (params.mode != MODE_NONE && params.mode != MODE_SIZE) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + params.mode = MODE_SIZE; + params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); + } + + if (!arg_found) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + } + + // the operation is split if not specified + if (params.operation == OP_NONE) { + params.operation = OP_SPLIT; + } + // the split mode is by tensor if not specified + if (params.mode == MODE_NONE) { + params.mode = MODE_TENSOR; + } + + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + + if (argc - arg_idx != 2) { + throw std::invalid_argument("error: bad arguments"); + } + + params.input = argv[arg_idx++]; + params.output = argv[arg_idx++]; +} + +static bool split_params_parse(int argc, const char ** argv, split_params & params) { + bool result = true; + try { + split_params_parse_ex(argc, argv, params); + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + split_print_usage(argv[0]); + exit(EXIT_FAILURE); + } + return result; +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +struct split_strategy { + const split_params params; + std::ifstream & f_input; + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_meta = NULL; + const int n_tensors; + + // one ctx_out per one output file + std::vector 
ctx_outs; + + // temporary buffer for reading in tensor data + std::vector read_buf; + + split_strategy(const split_params & params, + std::ifstream & f_input, + struct gguf_context * ctx_gguf, + struct ggml_context * ctx_meta) : + params(params), + f_input(f_input), + ctx_gguf(ctx_gguf), + ctx_meta(ctx_meta), + n_tensors(gguf_get_n_tensors(ctx_gguf)) { + + // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits + int i_split = -1; + struct gguf_context * ctx_out = NULL; + auto new_ctx_out = [&](bool allow_no_tensors) { + i_split++; + if (ctx_out != NULL) { + if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) { + fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); + exit(EXIT_FAILURE); + } + ctx_outs.push_back(ctx_out); + } + ctx_out = gguf_init_empty(); + // Save all metadata in first split only + if (i_split == 0) { + gguf_set_kv(ctx_out, ctx_gguf); + } + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder + gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); + }; + + // initialize ctx_out for the first split + new_ctx_out(false); + + // skip first split if no_tensor_first_split is set + if (params.no_tensor_first_split) { + new_ctx_out(true); + } + + // process tensors one by one + size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) + for (int i = 0; i < n_tensors; ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + // calculate the "imaginary" size = the current size + next tensor size + size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); + size_t next_tensors_size = curr_tensors_size + n_bytes; + if (should_split(i, next_tensors_size)) { + new_ctx_out(false); + curr_tensors_size = n_bytes; + } else { + curr_tensors_size = next_tensors_size; + } + gguf_add_tensor(ctx_out, t); + } + + // push the last ctx_out + ctx_outs.push_back(ctx_out); + + // set the correct n_split for all ctx_out + for (auto & ctx : ctx_outs) { + gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size()); + } + } + + ~split_strategy() { + for (auto & ctx_out : ctx_outs) { + gguf_free(ctx_out); + } + } + + bool should_split(int i_tensor, size_t next_size) { + if (params.mode == MODE_SIZE) { + // split by max size per file + return next_size > params.n_bytes_split; + } else if (params.mode == MODE_TENSOR) { + // split by number of tensors per file + return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; + } + // should never happen + GGML_ABORT("invalid mode"); + } + + void print_info() { + printf("n_split: %zu\n", ctx_outs.size()); + int i_split = 0; + for (auto & ctx_out : ctx_outs) { + // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) + size_t total_size = gguf_get_meta_size(ctx_out); + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); + total_size += ggml_nbytes(t); + } + total_size = total_size / 1000 / 1000; // convert to megabytes + printf("split %05d: n_tensors = %" PRIi64 ", total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + i_split++; + } + } + + void write() { + int i_split = 0; + int n_split = ctx_outs.size(); + for (auto & ctx_out : ctx_outs) { + // construct file path + char split_path[PATH_MAX] = {0}; + 
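+            // llama_split_path() builds the shard name from the output prefix, a zero-padded
+            // split index and the total split count (e.g. ggml-model-split-00001-of-00012.gguf,
+            // as exercised by tests.sh below)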
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); + + // open the output file + printf("Writing file %s ... ", split_path); + fflush(stdout); + std::ofstream fout = std::ofstream(split_path, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + // write metadata + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + // write tensors + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + // read tensor meta and prepare buffer + const char * t_name = gguf_get_tensor_name(ctx_out, i); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); + read_buf.resize(n_bytes); + + // calculate offset + auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + + // copy tensor from input to output file + copy_file_to_file(f_input, fout, offset, n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + printf("done\n"); + // close the file + fout.close(); + i_split++; + } + } + + void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { + // TODO: detect OS and use copy_file_range() here for better performance + if (read_buf.size() < len) { + read_buf.resize(len); + } + f_in.seekg(in_offset); + f_in.read((char *)read_buf.data(), len); + f_out.write((const char *)read_buf.data(), len); + } +}; + +static void gguf_split(const split_params & split_params) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + + auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + + // prepare the strategy + split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + int n_split = strategy.ctx_outs.size(); + strategy.print_info(); + + if (!split_params.dry_run) { + // write all output splits + strategy.write(); + } + + // done, clean up + gguf_free(ctx_gguf); + f_input.close(); + + fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", + __func__, n_split, strategy.n_tensors); +} + +static void gguf_merge(const split_params & split_params) { + fprintf(stderr, "%s: %s -> %s\n", + __func__, split_params.input.c_str(), + split_params.output.c_str()); + int n_split = 1; + int total_tensors = 0; + + // avoid overwriting existing output file + if (std::ifstream(split_params.output.c_str())) { + fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); + exit(EXIT_FAILURE); + } + + + auto * ctx_out = gguf_init_empty(); + + std::vector read_data; + std::vector ctx_metas; + std::vector ctx_ggufs; + + char split_path[PATH_MAX] = {0}; + strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); + char split_prefix[PATH_MAX] = {0}; + + // First pass to find KV and tensors metadata + for (int i_split = 0; i_split < n_split; i_split++) { + struct ggml_context * 
ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + if (i_split > 0) { + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + } + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); + + auto * ctx_gguf = gguf_init_from_file(split_path, params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + ctx_ggufs.push_back(ctx_gguf); + ctx_metas.push_back(ctx_meta); + + if (i_split == 0) { + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split < 0) { + fprintf(stderr, + "\n%s: input file does not contain %s metadata\n", + __func__, + LLM_KV_SPLIT_COUNT); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + exit(EXIT_FAILURE); + } + + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + if (n_split < 1) { + fprintf(stderr, + "\n%s: input file does not contain a valid split count %d\n", + __func__, + n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + exit(EXIT_FAILURE); + } + + // Verify the file naming and extract split_prefix + if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { + fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d" + " n_split=%d\n", __func__, + split_path, i_split, n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + exit(EXIT_FAILURE); + } + + // Do not trigger merge if we try to merge again the output + gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); + + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); + } + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + gguf_add_tensor(ctx_out, t); + } + total_tensors += n_tensors; + + fprintf(stderr, "\033[3Ddone\n"); + } + std::ofstream fout; + if (!split_params.dry_run) { + fout.open(split_params.output.c_str(), std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + // placeholder for the meta data + auto meta_size = gguf_get_meta_size(ctx_out); + ::zeros(fout, meta_size); + } + + // Write tensors data + for (int i_split = 0; i_split < n_split; i_split++) { + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + std::ifstream f_input(split_path, std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); + for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { + gguf_free(ctx_ggufs[i]); + ggml_free(ctx_metas[i]); + } + gguf_free(ctx_out); + if (!split_params.dry_run) { + fout.close(); + } + exit(EXIT_FAILURE); + } + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); + + auto * ctx_gguf = ctx_ggufs[i_split]; + auto * ctx_meta = ctx_metas[i_split]; + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + 
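+            // copy the raw tensor bytes from this split at the offset recorded in its header,
+            // then (unless this is a dry run) append them to the merged file with alignment padding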
f_input.read((char *)read_data.data(), n_bytes); + if (!split_params.dry_run) { + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + f_input.close(); + fprintf(stderr, "\033[3Ddone\n"); + } + + if (!split_params.dry_run) { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + fout.close(); + } + gguf_free(ctx_out); + + fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n", + __func__, split_params.output.c_str(), n_split, total_tensors); +} + +int main(int argc, const char ** argv) { + split_params params; + split_params_parse(argc, argv, params); + + switch (params.operation) { + case OP_SPLIT: gguf_split(params); + break; + case OP_MERGE: gguf_merge(params); + break; + default: split_print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/llama.cpp/tools/gguf-split/tests.sh b/llama.cpp/tools/gguf-split/tests.sh new file mode 100755 index 0000000..c8dd0b0 --- /dev/null +++ b/llama.cpp/tools/gguf-split/tests.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash + +set -eu + +if [ $# -lt 1 ] +then + echo "usage: $0 path_to_build_binary [path_to_temp_folder]" + echo "example: $0 ../../build/bin ../../tmp" + exit 1 +fi + +if [ $# -gt 1 ] +then + TMP_DIR=$2 +else + TMP_DIR=/tmp +fi + +set -x + +SPLIT=$1/llama-gguf-split +MAIN=$1/llama-completion +WORK_PATH=$TMP_DIR/gguf-split +ROOT_DIR=$(realpath $(dirname $0)/../../) + +mkdir -p "$WORK_PATH" + +# Clean up in case of previously failed test +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf + +# 1. Get a model +( +cd $WORK_PATH +"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/Qwen3-0.6B-GGUF --file Qwen3-0.6B-Q8_0.gguf +) +echo PASS + +# 2. Split with max tensors strategy +$SPLIT --split-max-tensors 28 $WORK_PATH/Qwen3-0.6B-Q8_0.gguf $WORK_PATH/ggml-model-split +echo PASS +echo + +# 2b. Test the sharded model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00012.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# 3. Merge +$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00012.gguf $WORK_PATH/ggml-model-merge.gguf +echo PASS +echo + +# 3b. Test the merged model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# 4. Split with no tensors in the first split +$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors +echo PASS +echo + +# 4b. Test the sharded model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00011.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# 5. Merge +#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00012.gguf $WORK_PATH/ggml-model-merge-2.gguf +#echo PASS +#echo + +# 5b. Test the merged model is loading properly +#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 +#echo PASS +#echo + +# 6. Split with size strategy +$SPLIT --split-max-size 500M $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-500M +echo PASS +echo + +# 6b. 
Test the sharded model is loading properly +$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-500M-00001-of-00002.gguf -p "I believe the meaning of life is" --n-predict 32 +echo PASS +echo + +# Clean up +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf diff --git a/llama.cpp/tools/imatrix/CMakeLists.txt b/llama.cpp/tools/imatrix/CMakeLists.txt new file mode 100644 index 0000000..5af6263 --- /dev/null +++ b/llama.cpp/tools/imatrix/CMakeLists.txt @@ -0,0 +1,13 @@ +set(TARGET llama-imatrix) +add_executable(${TARGET} imatrix.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() + +if (CMAKE_SYSTEM_NAME MATCHES "AIX") + # AIX's flock() function comes from libbsd.a + target_link_libraries(${TARGET} PRIVATE -lbsd) +endif() diff --git a/llama.cpp/tools/imatrix/README.md b/llama.cpp/tools/imatrix/README.md new file mode 100644 index 0000000..4505cb4 --- /dev/null +++ b/llama.cpp/tools/imatrix/README.md @@ -0,0 +1,98 @@ +# llama.cpp/tools/imatrix + +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models. +More information is available in . + +## Usage + +``` +./llama-imatrix \ + -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \ + [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \ + [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \ + [--show-statistics] [...] +``` + +Here `-m | --model` with a model name and `-f | --file` with a file containing calibration data (such as e.g. `wiki.train.raw`) are mandatory. +The parameters in square brackets are optional and have the following meaning: + +* `-h | --help` shows usage information and exits. +* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. +* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used. +* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) +* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf". +* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never) +* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. +* `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets. +* `--parse-special` enables parsing of special tokens (e.g., `<|im_start|>` in some models). Useful for models with custom tokenizers. +* `--chunk | --from-chunk` to skip the first `n` chunks of tokens from the input data. Useful for resuming or skipping initial low-quality data. +* `--chunks` maximum number of chunks to process. Default is -1 for all available chunks. 
+* `--no-ppl` disables the calculation of perplexity for the processed chunks. Useful if you want to speed up the processing and do not care about perplexity. +* `--show-statistics` displays imatrix file's statistics. + +For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu-layers` argument. + +Recent versions of `llama-imatrix` store data in GGUF format by default. For the legacy format, use an extension other than `.gguf` when saving the output file. More information is available in . + +## Examples + +```bash +# generate importance matrix using default filename (imatrix.gguf), offloading 99 layers to GPU +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -ngl 99 + +# use the imatrix to perform a Q4_K_M quantization +./llama-quantize --imatrix imatrix.gguf ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +``` + +```bash +# generate and save the imatrix using legacy format +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legcy-format.dat -ngl 99 +``` + +```bash +# convert legacy (binary) imatrix format to new (GGUF) format +./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf +``` + +```bash +# convert new (GGUF) imatrix format to legacy (binary) format +./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat +``` + +```bash +# combine existing imatrices +./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf +``` + +```bash +# skip first 5 chunks, save intermediates every 20 chunks and snapshots every 50, parsing special tokens +./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --chunk 5 --output-frequency 20 --save-frequency 50 --parse-special +``` + +```bash +# analyse imatrix file and display summary statistics instead of running inference +./llama-imatrix --in-file imatrix.gguf --show-statistics +``` + +`--show-statistics` will display the following statistics: + +#### Per tensor + +* Σ(Act²): sum of all squared activations (the importance scores) +* Min & Max: minimum and maximum squared activations values +* μ & σ: Squared activations' mean and standard deviation +* % Active: proportion of elements whose average squared activation exceeds a small threshold (1e-5). Helpful to determine how alive/dormant the tensor is during inference +* N: number of squared activations +* Entropy: entropy of the squared activation distribution, in bits (standard Shannon entropy measurement) $S = -\sum_{i=1}^N p_i \log_2 p_i$ +* E (norm): Normalized entropy. $E(norm)=\frac{-\sum_{i=1}^N p_i \log_2 p_i}{log_2 N}$. These two metrics can be used to determine how well a prompt "exercises" the model's capabilities +* ZD Score: z-score distribution as described in _3.1 Layer Importance Scores_ of [Layer-Wise Quantization](https://arxiv.org/abs/2406.17415) +* CosSim: cosine similarity with respect to the previous layer's tensor. Useful to determine how similar the squared activations of the current layer are to the previous layer's squared activations. + +#### Per layer + +Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated. + +#### Important note on the computed Statistics + +When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**. 
+Whilst the results are still useful, they're less realiable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors. diff --git a/llama.cpp/tools/imatrix/imatrix.cpp b/llama.cpp/tools/imatrix/imatrix.cpp new file mode 100644 index 0000000..669de55 --- /dev/null +++ b/llama.cpp/tools/imatrix/imatrix.cpp @@ -0,0 +1,1302 @@ +#include "arg.h" +#include "common.h" +#include "log.h" +#include "llama.h" +#include "gguf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static void print_usage(int, char ** argv) { + LOG("\nexample usage:\n"); + LOG("\n %s \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n" + " [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n" + " [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n" + " [--show-statistics] [...]\n" , argv[0]); + LOG("\n"); +} + +static const char * const LLM_KV_IMATRIX_DATASETS = "imatrix.datasets"; +static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; +static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; + +struct Stats { + std::vector values; + std::vector counts; +}; + +struct tensor_statistics { + std::string tensor; + Stats stats; + float total_sqract = 0.0f; + float mean_sqract = 0.0f; + float max_sqract = 0.0f; + float min_sqract = 0.0f; + int elements = 0; + float stddev = 0.0f; + float active = 0.0f; + float entropy = 0.0f; + float zd = 0.0f; + float cossim = 0.0f; +}; + +class IMatrixCollector { +public: + IMatrixCollector() = default; + void set_params(common_params params) { m_params = std::move(params); } + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); + void save_imatrix_legacy(int32_t ncall = -1) const; + void save_imatrix(int32_t n_chunk = -1) const; + bool load_imatrix_legacy(const char * fname); + bool load_imatrix(const char * file_name); + const std::unordered_map & get_mstats() const { return m_stats; } +private: + std::unordered_map m_stats; + common_params m_params; + std::mutex m_mutex; + std::vector m_datasets; + int32_t m_last_chunk = 0; + std::vector m_src1_data; + std::vector m_ids; // the expert ids from ggml_mul_mat_id +}; + +// remove any prefix and suffixes from the name +// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight +static std::string filter_tensor_name(const char * name) { + std::string wname; + const char * p = strchr(name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = name; + } + return wname; +} + +static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) { + std::vector name; + std::istringstream stream(input); + std::string item; + + while (std::getline(stream, item, '.')) { + name.push_back(item); + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "blk" && i + 1 < name.size()) { + layer = name[i + 1]; + break; + } + } + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == "weight" && i > 0) { + tensor = name[i - 1]; + break; + } + } + + if (tensor.empty()) { + tensor = input; + } + if (layer.empty()) { + layer = "-"; + } +} + +static void 
compute_statistics(std::vector & tstats, const std::string & name, const Stats & e) { + if (e.values.size() % e.counts.size() != 0) { + LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size()); + return; + } + if (e.counts.empty()) { + LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str()); + return; + } + + const int n_mat = e.counts.size(); + const int row_size = e.values.size() / n_mat; + + std::vector activations; + activations.reserve(e.values.size()); + + for (int i = 0; i < n_mat; ++i) { + for (int j = 0; j < row_size; ++j) { + activations.push_back(e.values[i*row_size + j] / e.counts[i]); + } + } + + const float act_total = std::accumulate(activations.begin(), activations.end(), 0.0f); + const float act_max = *std::max_element(activations.begin(), activations.end()); + const float act_min = *std::min_element(activations.begin(), activations.end()); + const float act_mean = act_total / activations.size(); + const float act_sqr_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f); + const float act_var = (act_sqr_total / activations.size()) - (act_mean * act_mean); + const float act_dev = std::sqrt(std::max(0.0f, act_var)); + float threshold = 1e-5f; + const int inactive_count = std::count_if(activations.begin(), activations.end(), + [threshold](const float v) { return fabsf(v) <= threshold; }); + const float active_ratio = 1 - static_cast(inactive_count) / activations.size(); + + float entropy = 0; + if (act_total > 0) { + for (const auto act : activations) { + if (const float p = act / act_total; p > 0) { + entropy -= p * std::log2(p); + } + } + } + + int z_score = 0; + if (act_dev > 0.0f) { + for (const auto act : activations) { + if (const float p = (act - act_mean) / act_dev; p > 1) { + z_score++; + } + } + } + + auto & ts = tstats.emplace_back(); + ts.tensor = name; + ts.stats = e; + ts.total_sqract = act_total; + ts.mean_sqract = act_mean; + ts.max_sqract = act_max; + ts.min_sqract = act_min; + ts.elements = static_cast(activations.size()); + ts.stddev = act_dev; + ts.active = active_ratio; + ts.entropy = entropy; + ts.zd = static_cast(z_score) / ts.elements; +} + +static void compute_cossim(std::vector & tstats) { + static const std::regex pattern(R"(blk\.(\d+)\.)"); + for (auto & ts : tstats) { + if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) { + const int blk = std::stoi(match[1]); + std::string tname(ts.tensor); + tname.replace(match.position(1), match.length(1), std::to_string(blk-1)); + auto prev = std::find_if(tstats.begin(), tstats.end(), + [tname](const tensor_statistics & t) { return t.tensor == tname; }); + if (prev != tstats.end()) { + const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), + prev->stats.values.begin(), 0.0f); + const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(), + ts.stats.values.begin(), 0.0f)); + const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(), + prev->stats.values.begin(), 0.0f)); + const float cs = dp / (curr_mag * prev_mag); + ts.cossim = cs; + } + } else { + ts.cossim = 0; + } + } +} + +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + std::string wname = 
filter_tensor_name(src0->name); + + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications + if (t->op != GGML_OP_MUL_MAT) return false; + // why are small batches ignored (<16 tokens)? + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false; + return true; + } + + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host) { + const size_t src1_nbytes = ggml_nbytes(src1); + m_src1_data.resize(src1_nbytes); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes); + } + + const char * data = is_host ? (const char *) src1->data : m_src1_data.data(); + GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + + // this has been adapted to the new format of storing merged experts in a single 3d tensor + // ref: https://github.com/ggml-org/llama.cpp/pull/6387 + if (t->op == GGML_OP_MUL_MAT_ID) { + // ids -> [n_experts_used, n_tokens] + // src1 -> [cols, n_expert_used, n_tokens] + const ggml_tensor * ids = t->src[2]; + const int64_t n_as = src0->ne[2]; + const int64_t n_ids = ids->ne[0]; + + // the top-k selected expert ids are stored in the ids tensor + // for simplicity, always copy ids to host, because it is small + // take into account that ids is not contiguous! + + GGML_ASSERT(ids->ne[1] == src1->ne[2]); + + // the extra dimension would need to be stored somewhere to be reflected in the imatrix file + if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { + LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); + GGML_ASSERT(false); + } + + m_ids.resize(ggml_nbytes(ids)); + ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); + + auto & e = m_stats[wname]; + + if (e.counts.size() == 1 && n_as > 1) { + // broadcast, when loading an old imatrix + e.counts.resize(n_as, e.counts[0]); + } + if (e.values.empty()) { + e.values.resize(src1->ne[0]*n_as, 0); + e.counts.resize(n_as, 0); + } + else if (e.values.size() != (size_t)src1->ne[0]*n_as) { + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as)); + exit(1); //GGML_ABORT("fatal error"); + } + else if (e.counts.size() != (size_t)n_as) { + LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as); + exit(1); //GGML_ABORT("fatal error"); + } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); + // loop over all possible experts, regardless if they are used or not in the batch + for (int64_t ex = 0; ex < n_as; ++ex) { + size_t e_start = ex*src1->ne[0]; + + for (int64_t idx = 0; idx < n_ids; ++idx) { + for (int64_t row = 0; row < src1->ne[2]; ++row) { + const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]); + + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + + if (excur != ex) continue; + + const int64_t i11 = idx % src1->ne[1]; + const int64_t i12 = row; + const float * x = 
(const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]); + + e.counts[ex]++; + + for (int64_t j = 0; j < src1->ne[0]; ++j) { + e.values[e_start + j] += x[j] * x[j]; + if (!std::isfinite((float)e.values[e_start + j])) { + LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str()); + exit(1); + } + } + } + } + const int32_t n_chunk = e.counts[ex] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { + save_imatrix(); + } + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); + } + } + } + } else { + auto & e = m_stats[wname]; + const int64_t n_mat = src0->ne[2] * src0->ne[3]; + + // use a single count per dense tensor + // (necessary when merging older GGUF-imatrix files with 3d tensors) + if (e.counts.size() > 1) { + bool all_equal = true; + for (size_t i = 1; i < e.counts.size(); ++i) { + if (e.counts[0] != e.counts[i]) { + all_equal = false; + break; + } + } + if (all_equal) { + e.counts.resize(1); + } + } + if (e.values.empty()) { + e.values.resize(src1->ne[0] * n_mat, 0); + e.counts.resize(1, 0); + } + else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) { + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat)); + exit(1); //GGML_ABORT("fatal error"); + } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type); + + for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) { + for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) { + // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D + const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]); + const int64_t mat_start = mat_id * src1->ne[0]; + + for (int64_t row = 0; row < src1->ne[1]; ++row) { + const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]); + for (int64_t j = 0; j < src1->ne[0]; ++j) { + e.values[mat_start + j] += x[j] * x[j]; + if (!std::isfinite((float)e.values[j])) { + LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str()); + exit(1); + } + } + } + } + } + // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT + for (size_t i = 0; i < e.counts.size(); ++i) { + e.counts[i] += ggml_nrows(src1) / n_mat; + const int32_t n_chunk = e.counts[i] / chunk_size; + if (n_chunk > m_last_chunk) { + const int32_t chunk_step = n_chunk - m_last_chunk; + m_last_chunk = n_chunk; + if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { + save_imatrix(); + } + if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { + save_imatrix(m_last_chunk); + } + } + } + } + + return true; +} + +void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const { + auto fname = m_params.out_file; + + if (ncall > 0) { + fname += ".at_"; + fname += std::to_string(ncall); + } + + // warn when writing imatrix entries that do not have full data + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + int n_entries = 0; + std::vector to_store; + + bool is_first = true; // for printing + for (const auto & kv : m_stats) { + const int n_all = kv.second.counts.size(); + + if (n_all == 0) { + continue; + } + + int n_zeros = 0; + 
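+        // count how many expert slots never received any activations (partial MoE coverage)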
for (const int c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + LOG_INF("\n"); + is_first = false; + } + + if (n_zeros == n_all) { + LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); + continue; + } + + if (n_zeros > 0) { + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + } + + n_entries++; + to_store.push_back(kv.first); + } + + if (to_store.size() < m_stats.size()) { + LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); + } + + // deterministic tensor name order + std::sort(to_store.begin(), to_store.end()); + + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + + std::ofstream out(fname, std::ios::binary); + out.write((const char *) &n_entries, sizeof(n_entries)); + for (const auto & name : to_store) { + const auto & stat = m_stats.at(name); + const int32_t len = name.size(); + out.write((const char *) &len, sizeof(len)); + out.write(name.c_str(), len); + // ceiling division to avoid accidental zeros + const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size; + out.write((const char *) &ncall, sizeof(ncall)); + const int32_t nval = stat.values.size(); + const int32_t nmat = stat.counts.size(); + out.write((const char *) &nval, sizeof(nval)); + if (nval > 0 && nmat > 0) { + std::vector tmp(nval); + for (int32_t i = 0; i < nval; i++) { + float count = static_cast(stat.counts[i / (nval / nmat)]); + float value = stat.values[i]; + if (count == 0.0f) { + // store 1 for partial data + value = 1.0f; + count = 1.0f; + } + tmp[i] = (value / count) * static_cast(ncall); + } + out.write((const char *) tmp.data(), nval * sizeof(float)); + } + } + + // Write the number of call the matrix was computed with + out.write((const char *) &m_last_chunk, sizeof(m_last_chunk)); + + // Write the input filename at the end of the file to later on specify it in quantize + { + const char * dataset_file = m_params.prompt_file.c_str(); + int32_t len = m_params.prompt_file.size(); + // When there is no prompt but there were other imatrix files loaded, use the last dataset + if (m_params.prompt_file.empty() && !m_datasets.empty()) { + const std::string & dataset_str = m_datasets[m_datasets.size() - 1]; + dataset_file = dataset_str.c_str(); + len = dataset_str.size(); + } + out.write((const char *) &len, sizeof(len)); + out.write(dataset_file, len); + } + + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str()); +} + +void IMatrixCollector::save_imatrix(int32_t n_chunk) const { + auto fname = m_params.out_file; + int8_t use_legacy_format = m_params.imat_dat; + + if (use_legacy_format > 0) { + this->save_imatrix_legacy(n_chunk); + return; + } + // only warn when `--output-format gguf` is not specified + if (use_legacy_format == 0 && !string_ends_with(fname, ".gguf")) { + LOG_WRN("\n%s: saving imatrix using GGUF format with a different suffix than .gguf\n", __func__); + LOG_WRN("%s: if you want the previous imatrix format, use --output-format dat\n", __func__); + } + + if (n_chunk > 0) { + fname += ".at_"; + fname += std::to_string(n_chunk); + } + + // write imatrix entries even if they don't have full data. 
(can be corrected when reading) + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + std::vector to_store; + size_t data_size = 0; + + bool is_first = true; // for printing + for (const auto & kv : m_stats) { + const int n_all = kv.second.counts.size(); + + int n_zeros = 0; + for (const auto c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + LOG_INF("\n"); + is_first = false; + } + + if (n_zeros > 0) { + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + } + + to_store.push_back(kv.first); + data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN); + data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN); + } + + // deterministic tensor name order + std::sort(to_store.begin(), to_store.end()); + + struct ggml_init_params params = { + /* .mem_size = */ data_size, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(params); + struct gguf_context * ctx_gguf = gguf_init_empty(); + + { + std::vector datasets; + datasets.reserve(m_datasets.size() + 1); + for (size_t i = 0; i < m_datasets.size(); ++i) { + datasets.push_back(m_datasets[i].c_str()); + } + if (!m_params.prompt_file.empty()) { + datasets.push_back(m_params.prompt_file.c_str()); + } + + gguf_set_val_str(ctx_gguf, "general.type", "imatrix"); + // Write the dataset paths + gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size()); + // Write the number of chunks the matrix was computed with + gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk); + gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel); + } + + for (const auto & name : to_store) { + const auto & stat = m_stats.at(name); + const int32_t nval = (int32_t) stat.values.size(); + const int32_t nmat = (int32_t) stat.counts.size(); + if (nval > 0 && nmat > 0) { + struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat); + struct ggml_tensor * counts = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat); + ggml_format_name(in_sum2, "%s.in_sum2", name.c_str()); + ggml_format_name(counts, "%s.counts", name.c_str()); + + for (int32_t j = 0; j < nval; ++j) { + ((float *) in_sum2->data)[j] = (float) stat.values[j]; + } + for (int32_t j = 0; j < nmat; ++j) { + ((float *) counts->data)[j] = (float) stat.counts[j]; + } + + gguf_add_tensor(ctx_gguf, in_sum2); + gguf_add_tensor(ctx_gguf, counts); + } + } + + gguf_write_to_file(ctx_gguf, fname.c_str(), false); + + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str()); + + gguf_free(ctx_gguf); + ggml_free(ctx); +} + +bool IMatrixCollector::load_imatrix_legacy(const char * fname) { + std::ifstream in(fname, std::ios::binary); + if (!in) { + LOG_ERR("%s: failed to open %s\n", __func__, fname); + return false; + } + int n_entries; + in.read((char *) &n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + LOG_ERR("%s: no data in file %s\n", __func__, fname); + return false; + } + // Guess the chunk size because it's not stored in the file + const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel; + + for (int i = 0; i < n_entries; ++i) { + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + std::vector 
name_as_vec(len + 1); + in.read((char *) name_as_vec.data(), len); + if (in.fail()) { + LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname); + return false; + } + name_as_vec[len] = 0; + std::string name{ name_as_vec.data() }; + auto & e = m_stats[std::move(name)]; + int32_t ncall = 0; + in.read((char *) &ncall, sizeof(ncall)); + int32_t nval = 0; + in.read((char *) &nval, sizeof(nval)); + if (in.fail() || nval < 1) { + LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i); + m_stats = {}; + return false; + } + + if (e.values.empty()) { + e.values.resize(nval, 0.0f); + e.counts.resize(1, 0); + } + + std::vector tmp(nval); + in.read((char *) tmp.data(), nval * sizeof(float)); + if (in.fail()) { + LOG_ERR("%s: failed reading data for entry %d\n", __func__, i); + m_stats = {}; + return false; + } + + // Recreate the state as expected by save_imatrix(), and correct for weighted sum. + for (int i = 0; i < nval; i++) { + e.values[i] += tmp[i] * chunk_size; + } + // The legacy format doesn't distinguish the counts for different experts + for (size_t j = 0; j < e.counts.size(); ++j) { + e.counts[j] += ncall * chunk_size; + } + } + + { + // TODO: extract into its own method; this is also used by the GGUF-based format + // Calculate the last chunk count + int64_t max_count = 0; + for (const auto & stats : m_stats) { + for (int64_t count : stats.second.counts) { + if (count > max_count) { + max_count = count; + } + } + } + m_last_chunk = max_count / (chunk_size); + } + + { + // Read the number of calls the matrix was computed with + int32_t n_calls; + in.read((char *) &n_calls, sizeof(n_calls)); + // ignore it because it's not important + } + + // Read the dataset path to include it when writing to GGUF + if (!in.fail()){ + int32_t len = 0; + in.read((char *) &len, sizeof(len)); + if (!in.fail()) { + std::vector dataset; + dataset.resize(len + 1, 0); + in.read(dataset.data(), len); + if (!in.fail()) { + m_datasets.push_back(dataset.data()); + } + } + } + + return true; +} + +// Using GGUF as the file format, for greater extensibility +bool IMatrixCollector::load_imatrix(const char * file_name) { + struct ggml_context * ctx = nullptr; + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ false, // the data is needed + /* .ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params); + if (!ctx_gguf) { + return this->load_imatrix_legacy(file_name); + } + const int32_t n_entries = gguf_get_n_tensors(ctx_gguf); + if (n_entries < 1) { + LOG_ERR("%s: no data in file %s\n", __func__, file_name); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS); + if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) { + const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key); + m_datasets.reserve(m_datasets.size() + n); + for (int64_t i = 0; i < n; ++i) { + m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i)); + } + } + + const std::string in_sum2_suffix{ ".in_sum2" }; + const std::string counts_suffix{ ".counts" }; + + // Could re-use m_stats instead, but this allows + // checking for completeness of *each* loaded imatrix file + // and also makes it easier to re-use a similar implementation in quantize.cpp + // Using an ordered map to get a deterministic iteration order. 
+ std::map> sums_counts_for; + + for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string name = cur->name; + + if (name.empty()) { continue; } + + if (string_remove_suffix(name, in_sum2_suffix)) { + // in_sum2 + sums_counts_for[std::move(name)].first = cur; + } else if (string_remove_suffix(name, counts_suffix)) { + // counts + sums_counts_for[std::move(name)].second = cur; + } else { + // ignore other tensors + } + } + + for (const auto & sc : sums_counts_for) { + const std::string & name = sc.first; + const struct ggml_tensor * in_sum2 = sc.second.first; + const struct ggml_tensor * counts = sc.second.second; + + if (!in_sum2 || !counts) { + LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + auto & e = m_stats[name]; + + int64_t nval = ggml_nelements(in_sum2); + if (e.values.empty()) { + e.values.resize(nval, 0.0f); + } else if ((size_t) nval != e.values.size()) { + LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + int64_t ncounts = ggml_nelements(counts); + if (e.counts.empty()) { + e.counts.resize(ncounts, 0); + } else if (e.counts.size() == 1 && ncounts > 1) { + // broadcast, when loading an old imatrix + e.counts.resize(ncounts, e.counts[0]); + } else if ((size_t) ncounts != e.counts.size()) { + LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size()); + gguf_free(ctx_gguf); + ggml_free(ctx); + return false; + } + + // Recreate the state as expected by save_imatrix() + for (int64_t j = 0; j < nval; j++) { + e.values[j] += ((const float *) in_sum2->data)[j]; + } + for (int64_t j = 0; j < ncounts; j++) { + e.counts[j] += std::lround(((const float *) counts->data)[j]); + } + } + + // TODO: extract into its own method; this is also used by the legacy format + // Calculate the last chunk count + int64_t max_count = 0; + for (const auto & stats : m_stats) { + for (int64_t count : stats.second.counts) { + if (count > max_count) { + max_count = count; + } + } + } + m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel); + + gguf_free(ctx_gguf); + ggml_free(ctx); + return true; +} + +static IMatrixCollector g_collector; + +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); +} + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +static std::vector softmax(const std::vector & logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) { + max_logit = std::max(max_logit, v); + } + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } + return probs; +} + +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } + return 
{logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; +} + +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history) { + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { + double local_nll = 0; + double local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; + local_nll += v; + local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } +} + +static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const bool add_bos = llama_vocab_get_add_bos(vocab); + + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + + auto tim1 = std::chrono::high_resolution_clock::now(); + LOG_INF("%s: tokenizing the input ..\n", __func__); + + std::vector tokens = common_tokenize(ctx, params.prompt, true, params.parse_special); + + auto tim2 = std::chrono::high_resolution_clock::now(); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (params.i_chunk > 0) { + if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { + LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); + return false; + } + LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); + } + + if (int(tokens.size()) < 2*n_ctx) { + LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size()); + return false; + } + + std::vector logit_history; + std::vector prob_history; + + if (params.compute_ppl) { + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + } + + const int n_chunk_max = tokens.size() / n_ctx; + + const int n_chunk = params.n_chunks < 0 ? 
n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_vocab_n_tokens(vocab); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + double nll2 = 0.0; + + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + const int n_seq = std::max(1, n_batch / n_ctx); + + GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0); + GGML_ASSERT(params.n_ctx == n_seq * n_ctx); + + llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1); + + std::vector logits; + if (params.compute_ppl && num_batches > 1) { + logits.reserve((size_t)n_ctx * n_vocab); + } + + LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + + std::vector workers(std::thread::hardware_concurrency() - 1); + + for (int i = 0; i < n_chunk; i += n_seq) { + const int start = i * n_ctx; + const int end = start + n_ctx; + + const int n_seq_batch = std::min(n_seq, n_chunk - i); + + const auto t_start = std::chrono::high_resolution_clock::now(); + + // clear the KV cache + llama_memory_clear(llama_get_memory(ctx), true); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + // clear the batch + common_batch_clear(batch); + + for (int seq = 0; seq < n_seq_batch; seq++) { + int seq_start = batch_start + seq*n_ctx; + + // save original token and restore it after eval + const auto token_org = tokens[seq_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[seq_start] = llama_vocab_bos(vocab); + } + for (int k = 0; k < batch_size; ++k) { + // NOTE: specifying all logits to get activations for the output.weight tensor + // and also for the perplexity calculation. + // TODO: only get outputs when (params.process_output || params.compute_ppl) + // (not possible when this skips FFN computation of the last layer) + common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true); + } + + // restore the original token in case it was set to BOS + tokens[seq_start] = token_org; + } + + if (llama_decode(ctx, batch)) { + LOG_ERR("%s : failed to eval\n", __func__); + llama_batch_free(batch); + return false; + } + + if (params.compute_ppl && num_batches > 1) { + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } + } + + + if (i == 0) { + llama_synchronize(ctx); + const auto t_end = std::chrono::high_resolution_clock::now(); + const float t_total = std::chrono::duration(t_end - t_start).count(); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk / n_seq); + if (total_seconds >= 60*60) { + LOG("%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + LOG("%.2f minutes\n", total_seconds / 60.0); + } + + if (params.compute_ppl) { + const int first = n_ctx/2; + for (int seq = 0; seq < n_seq_batch; seq++) { + const float * all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits_ith(ctx, seq*n_ctx); + + llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first; + + process_logits(n_vocab, all_logits + first*n_vocab, + tokens_data, n_ctx - 1 - first, + workers, nll, nll2, + logit_history.data() + start + seq*n_ctx + first, + prob_history.data() + start + seq*n_ctx + first); + + count += n_ctx - first - 1; + + LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + } + fflush(stdout); + + logits.clear(); + } + } + + LOG("\n"); + + if (params.compute_ppl) { + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + LOG("Unexpected negative standard deviation of log(prob)\n"); + } + } + + llama_batch_free(batch); + + return true; +} + +static bool show_statistics(const common_params & params) { + std::vector ts; + if (params.in_files.empty() || params.in_files.size() > 1) { + LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n"); + return false; + } + if (g_collector.load_imatrix(params.in_files[0].c_str())) { + for (const auto & [name, stats] :g_collector.get_mstats()) { + compute_statistics(ts, name, stats); + } + } else { + LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str()); + return false; + } + if (!ts.empty()) { + compute_cossim(ts); + } else { + LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str()); + return false; + } + + struct tensor_comparer { + bool operator()(const tensor_statistics & a, const tensor_statistics & b) const { + std::string layer, name_a, name_b; + ; + process_tensor_name(a.tensor, layer, name_a); + process_tensor_name(b.tensor, layer, name_b); + return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract); + } + }; + std::sort(ts.begin(), ts.end(), tensor_comparer()); + + struct weighted_stats { + float weighted_bias = 0.0f; + float weighted_zd = 0.0f; + float weighted_cossim = 0.0f; + int total_elements = 0; + }; + std::map ws; + + LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast(ts.size())); + LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Act²)", + " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD", + " CosSim"); + LOG_INF( + "==============================================================================================================" + "===========================================================\n"); + for (const auto & tstat : ts) { + std::string layer, name; + process_tensor_name(tstat.tensor, layer, name); + + int blk; + try { + blk = std::stoi(layer); + } catch (const std::exception & e) { + blk = -1; // not a block layer + } + + LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n", + layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract, + tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy, + 100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim); + + const float weighted_bias = tstat.elements * tstat.total_sqract; + const float weighted_zd = tstat.elements * tstat.zd; + const float weighted_cossim = tstat.elements * tstat.cossim; + + if (ws.find(blk) != ws.end()) { + ws[blk].weighted_bias += weighted_bias; + ws[blk].weighted_zd += weighted_zd; + 
ws[blk].weighted_cossim += weighted_cossim; + ws[blk].total_elements += tstat.elements; + } else { + weighted_stats temp_ws; + temp_ws.weighted_bias = weighted_bias; + temp_ws.weighted_zd = weighted_zd; + temp_ws.weighted_cossim = weighted_cossim; + temp_ws.total_elements = tstat.elements; + ws[blk] = temp_ws; + } + } + + const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; }); + LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers); + LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Act²)", " μZD", "μCosSim"); + LOG_INF("================================================\n"); + for (const auto & [first, second] : ws) { + const auto & layer = first; + const auto & stats = second; + + if (stats.total_elements == 0) { + continue; + } + + if (layer >= 0) { + const float bias = stats.weighted_bias / stats.total_elements; + const float zd = stats.weighted_zd / stats.total_elements; + const float cossim = stats.weighted_cossim / stats.total_elements; + + LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim); + } + } + LOG_INF("\n"); + + return true; +} + +int main(int argc, char ** argv) { + common_params params; + + params.out_file = "imatrix.gguf"; + + params.n_ctx = 512; + params.escape = false; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { + return 1; + } + + if (params.show_statistics) { + if (!show_statistics(params)) { + return 1; + } + return 0; + } + + common_init(); + + const int32_t n_ctx = params.n_ctx; + + if (n_ctx <= 0) { + LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__); + return 1; + } + + { + const int32_t n_seq = std::max(1, params.n_batch / n_ctx); + const int32_t n_kv = n_seq * n_ctx; + + params.n_parallel = n_seq; + params.n_ctx = n_kv; + + params.n_batch = std::min(params.n_batch, n_kv); + } + + g_collector.set_params(params); + + for (const auto & in_file : params.in_files) { + LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + if (!g_collector.load_imatrix(in_file.c_str())) { + LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str()); + return 1; + } + } + + if (params.prompt.empty()) { + LOG_INF("No prompt provided; combining precomputed matrices only.\n"); + + if (params.in_files.empty()) { + LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); + return 1; + } + + if (params.in_files.size() == 1) { + LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str()); + } else if (params.in_files.size() > 1) { + LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + } + + g_collector.save_imatrix(); + + return 0; + } + + llama_backend_init(); + llama_numa_init(params.numa); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = ik_collect_imatrix; + params.cb_eval_user_data = NULL; + params.warmup = false; + + // init + auto llama_init = common_init_from_params(params); + + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); + + if (model == nullptr || ctx == nullptr) { + LOG_ERR("%s : failed to init\n", __func__); + return 1; + } + + const int n_ctx_train = llama_model_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + + // print system information + { + LOG_INF("\n"); + 
LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + } + + if (!compute_imatrix(ctx, params, n_ctx)) { + return 1; + } + + g_collector.save_imatrix(); + + LOG("\n"); + llama_perf_context_print(ctx); + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/llama-bench/CMakeLists.txt b/llama.cpp/tools/llama-bench/CMakeLists.txt new file mode 100644 index 0000000..b8543a9 --- /dev/null +++ b/llama.cpp/tools/llama-bench/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET llama-bench) +add_executable(${TARGET} llama-bench.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() diff --git a/llama.cpp/tools/llama-bench/README.md b/llama.cpp/tools/llama-bench/README.md new file mode 100644 index 0000000..c837bb6 --- /dev/null +++ b/llama.cpp/tools/llama-bench/README.md @@ -0,0 +1,349 @@ +# llama.cpp/tools/llama-bench + +Performance testing tool for llama.cpp. + +## Table of contents + +1. [Syntax](#syntax) +2. [Examples](#examples) + 1. [Text generation with different models](#text-generation-with-different-models) + 2. [Prompt processing with different batch sizes](#prompt-processing-with-different-batch-sizes) + 3. [Different numbers of threads](#different-numbers-of-threads) + 4. [Different numbers of layers offloaded to the GPU](#different-numbers-of-layers-offloaded-to-the-gpu) +3. [Output formats](#output-formats) + 1. [Markdown](#markdown) + 2. [CSV](#csv) + 3. [JSON](#json) + 4. [JSONL](#jsonl) + 5. [SQL](#sql) + +## Syntax + +``` +usage: llama-bench [options] + +options: + -h, --help + --numa numa mode (default: disabled) + -r, --repetitions number of times to repeat each test (default: 5) + --prio <0|1|2|3> process/thread priority (default: 0) + --delay <0...N> (seconds) delay between each test (default: 0) + -o, --output output format printed to stdout (default: md) + -oe, --output-err output format printed to stderr (default: none) + --list-devices list available devices and exit + -v, --verbose verbose output + --progress print test progress indicators + -rpc, --rpc register RPC devices (comma separated) + +test parameters: + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -pg (default: ) + -d, --n-depth (default: 0) + -b, --batch-size (default: 2048) + -ub, --ubatch-size (default: 512) + -ctk, --cache-type-k (default: f16) + -ctv, --cache-type-v (default: f16) + -t, --threads (default: system dependent) + -C, --cpu-mask (default: 0x0) + --cpu-strict <0|1> (default: 0) + --poll <0...100> (default: 50) + -ngl, --n-gpu-layers (default: 99) + -ncmoe, --n-cpu-moe (default: 0) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -fa, --flash-attn <0|1> (default: 0) + -dev, --device (default: auto) + -mmp, --mmap <0|1> (default: 1) + -embd, --embeddings <0|1> (default: 0) + -ts, --tensor-split (default: 0) + -ot --override-tensors =;... + (default: disabled) + -nopo, --no-op-offload <0|1> (default: 0) + +Multiple values can be given for each parameter by separating them with ',' +or by specifying the parameter multiple times. Ranges can be given as +'first-last' or 'first-last+step' or 'first-last*mult'. 
+``` + +llama-bench can perform three types of tests: + +- Prompt processing (pp): processing a prompt in batches (`-p`) +- Text generation (tg): generating a sequence of tokens (`-n`) +- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`) + +With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`). + +Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition. + +Using the `-d ` option, each test can be run at a specified context depth, prefilling the KV cache with `` tokens. + +For a description of the other options, see the [completion example](../completion/README.md). + +> [!NOTE] +> The measurements with `llama-bench` do not include the times for tokenization and for sampling. + +## Examples + +### Text generation with different models + +```sh +$ ./llama-bench -m models/7B/ggml-model-q4_0.gguf -m models/13B/ggml-model-q4_0.gguf -p 0 -n 128,256,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 132.19 ± 0.55 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 256 | 129.37 ± 0.54 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 512 | 123.83 ± 0.25 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 128 | 82.17 ± 0.31 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 256 | 80.74 ± 0.23 | +| llama 13B mostly Q4_0 | 6.86 GiB | 13.02 B | CUDA | 99 | tg 512 | 78.08 ± 0.07 | + +### Prompt processing with different batch sizes + +```sh +$ ./llama-bench -n 0 -p 1024 -b 128,256,512,1024 +``` + +| model | size | params | backend | ngl | n_batch | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 128 | pp 1024 | 1436.51 ± 3.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 256 | pp 1024 | 1932.43 ± 23.48 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 512 | pp 1024 | 2254.45 ± 15.59 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | 1024 | pp 1024 | 2498.61 ± 13.58 | + +### Different numbers of threads + +```sh +$ ./llama-bench -n 0 -n 16 -p 64 -t 1,2,4,8,16,32 +``` + +| model | size | params | backend | threads | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ---------: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | pp 64 | 6.17 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 1 | tg 16 | 4.05 ± 0.02 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | pp 64 | 12.31 ± 0.13 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 2 | tg 16 | 7.80 ± 0.07 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | pp 64 | 23.18 ± 0.06 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 4 | tg 16 | 12.22 ± 0.07 | +| llama 7B mostly Q4_0 | 
3.56 GiB | 6.74 B | CPU | 8 | pp 64 | 32.29 ± 1.21 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 8 | tg 16 | 16.71 ± 0.66 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | pp 64 | 33.52 ± 0.03 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 16 | tg 16 | 15.32 ± 0.05 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | pp 64 | 59.00 ± 1.11 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CPU | 32 | tg 16 | 16.41 ± 0.79 | + +### Different numbers of layers offloaded to the GPU + +```sh +$ ./llama-bench -ngl 10,20,30,31,32,33,34,35 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | pp 512 | 373.36 ± 2.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 10 | tg 128 | 13.45 ± 0.93 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | pp 512 | 472.65 ± 1.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 20 | tg 128 | 21.36 ± 1.94 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | pp 512 | 631.87 ± 11.25 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 30 | tg 128 | 40.04 ± 1.82 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | pp 512 | 657.89 ± 5.08 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 31 | tg 128 | 48.19 ± 0.81 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | pp 512 | 688.26 ± 3.29 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 32 | tg 128 | 54.78 ± 0.65 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | pp 512 | 704.27 ± 2.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 33 | tg 128 | 60.62 ± 1.76 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | pp 512 | 881.34 ± 5.40 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 34 | tg 128 | 71.76 ± 0.23 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | + +### Different prefilled context + +``` +$ ./llama-bench -d 0,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 | + +## Output formats + +By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. 
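+
+The `-oe` option selects the output format printed to stderr (default: none), which can be useful when stdout is redirected to a file. For example (an illustrative combination of the options from the usage section above):
+
+```sh
+$ ./llama-bench -o csv -oe md > results.csv
+```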
+ +### Markdown + +```sh +$ ./llama-bench -o md +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ---------- | ---------------: | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | pp 512 | 2368.80 ± 93.24 | +| llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 99 | tg 128 | 131.42 ± 0.59 | + +### CSV + +```sh +$ ./llama-bench -o csv +``` + +```csv +build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" +``` + +### JSON + +```sh +$ ./llama-bench -o json +``` + +```json +[ + { + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", + "n_gpu_layers": 99, + "split_mode": "layer", + "main_gpu": 0, + "no_kv_offload": false, + "flash_attn": false, + "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, + "n_prompt": 512, + "n_gen": 0, + "n_depth": 0, + "test_time": "2025-04-24T11:58:50Z", + "avg_ns": 72135640, + "stddev_ns": 1453752, + "avg_ts": 7100.002165, + "stddev_ts": 140.341520, + "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ], + "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ] + }, + { + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", + "n_gpu_layers": 99, + "split_mode": "layer", + "main_gpu": 0, + "no_kv_offload": false, + "flash_attn": false, + "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, + "n_prompt": 0, + "n_gen": 128, + "n_depth": 0, + "test_time": "2025-04-24T11:58:51Z", + "avg_ns": 1076767880, + "stddev_ns": 9449585, + "avg_ts": 118.881588, + "stddev_ts": 1.041811, + "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ], + "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ] + } 
+] +``` + + +### JSONL + +```sh +$ ./llama-bench -o jsonl +``` + +```json lines +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} +``` + + +### SQL + +SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. 
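+
+For example, the emitted statements can be loaded straight into a local database (illustrative; the database filename is arbitrary):
+
+```sh
+# create or extend a SQLite database from the CREATE TABLE / INSERT statements
+$ ./llama-bench -o sql | sqlite3 llama-bench.sqlite
+```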
+ +```sh +$ ./llama-bench -o sql +``` + +```sql +CREATE TABLE IF NOT EXISTS test ( + build_commit TEXT, + build_number INTEGER, + cpu_info TEXT, + gpu_info TEXT, + backends TEXT, + model_filename TEXT, + model_type TEXT, + model_size INTEGER, + model_n_params INTEGER, + n_batch INTEGER, + n_ubatch INTEGER, + n_threads INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, + n_gpu_layers INTEGER, + split_mode TEXT, + main_gpu INTEGER, + no_kv_offload INTEGER, + flash_attn INTEGER, + tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, + n_prompt INTEGER, + n_gen INTEGER, + n_depth INTEGER, + test_time TEXT, + avg_ns INTEGER, + stddev_ns INTEGER, + avg_ts REAL, + stddev_ts REAL +); + +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); +``` diff --git a/llama.cpp/tools/llama-bench/llama-bench.cpp b/llama.cpp/tools/llama-bench/llama-bench.cpp new file mode 100644 index 0000000..7da6c39 --- /dev/null +++ b/llama.cpp/tools/llama-bench/llama-bench.cpp @@ -0,0 +1,2291 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "ggml.h" +#include "llama.h" + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#endif + +// utils +static uint64_t get_time_ns() { + using clock = std::chrono::high_resolution_clock; + return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); +} + +static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) { + if (a.pattern != b.pattern) { + // cString comparison that may be null + if (a.pattern == nullptr || b.pattern == nullptr) { + return false; + } + if (strcmp(a.pattern, b.pattern) != 0) { + return false; + } + } + if (a.buft != b.buft) { + return false; + } + return true; +} + +static bool vec_tensor_buft_override_equal(const std::vector& a, const 
std::vector& b) { + if (a.size() != b.size()) { + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (!tensor_buft_override_equal(a[i], b[i])) { + return false; + } + } + return true; +} + +static bool vec_vec_tensor_buft_override_equal(const std::vector>& a, const std::vector>& b) { + if (a.size() != b.size()) { + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (!vec_tensor_buft_override_equal(a[i], b[i])) { + return false; + } + } + return true; +} + +template static std::string join(const std::vector & values, const std::string & delim) { + std::ostringstream str; + for (size_t i = 0; i < values.size(); i++) { + str << values[i]; + if (i < values.size() - 1) { + str << delim; + } + } + return str.str(); +} + +template static std::vector transform_to_str(const std::vector & values, F f) { + std::vector str_values; + std::transform(values.begin(), values.end(), std::back_inserter(str_values), f); + return str_values; +} + +template static T avg(const std::vector & v) { + if (v.empty()) { + return 0; + } + T sum = std::accumulate(v.begin(), v.end(), T(0)); + return sum / (T) v.size(); +} + +template static T stdev(const std::vector & v) { + if (v.size() <= 1) { + return 0; + } + T mean = avg(v); + T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0)); + T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1)); + return stdev; +} + +static std::string get_cpu_info() { + std::vector cpu_list; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) { + cpu_list.push_back(ggml_backend_dev_description(dev)); + } + } + return join(cpu_list, ", "); +} + +static std::string get_gpu_info() { + std::vector gpu_list; + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + auto * dev = ggml_backend_dev_get(i); + auto dev_type = ggml_backend_dev_type(dev); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) { + gpu_list.push_back(ggml_backend_dev_description(dev)); + } + } + return join(gpu_list, ", "); +} + +static std::vector parse_devices_arg(const std::string & value) { + std::vector devices; + std::string trimmed = string_strip(value); + if (trimmed.empty()) { + throw std::invalid_argument("no devices specified"); + } + if (trimmed == "auto") { + return devices; + } + + auto dev_names = string_split(trimmed, '/'); + if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") { + devices.push_back(nullptr); + return devices; + } + + for (auto & name : dev_names) { + std::string dev_name = string_strip(name); + if (dev_name.empty()) { + throw std::invalid_argument("invalid device specification"); + } + auto * dev = ggml_backend_dev_by_name(dev_name.c_str()); + if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str())); + } + devices.push_back(dev); + } + + devices.push_back(nullptr); + return devices; +} + +static void register_rpc_server_list(const std::string & servers) { + auto rpc_servers = string_split(servers, ','); + if (rpc_servers.empty()) { + throw std::invalid_argument("no RPC servers specified"); + } + + auto * rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + throw std::invalid_argument("failed to find RPC backend"); + } + + using add_rpc_server_fn = 
ggml_backend_reg_t (*)(const char * endpoint); + auto * ggml_backend_rpc_add_server_fn = (add_rpc_server_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server"); + if (!ggml_backend_rpc_add_server_fn) { + throw std::invalid_argument("failed to find RPC add server function"); + } + for (const auto & server : rpc_servers) { + auto reg = ggml_backend_rpc_add_server_fn(server.c_str()); + ggml_backend_register(reg); + } +} + +static std::string devices_to_string(const std::vector & devices) { + if (devices.empty()) { + return "auto"; + } + + if (devices.size() == 1 && devices[0] == nullptr) { + return "none"; + } + + std::vector names; + for (auto * dev : devices) { + if (dev == nullptr) { + break; + } + names.push_back(ggml_backend_dev_name(dev)); + } + + return join(names, "/"); +} + +// command line params +enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL }; + +static const char * output_format_str(output_formats format) { + switch (format) { + case NONE: + return "none"; + case CSV: + return "csv"; + case JSON: + return "json"; + case JSONL: + return "jsonl"; + case MARKDOWN: + return "md"; + case SQL: + return "sql"; + default: + GGML_ABORT("invalid output format"); + } +} + +static bool output_format_from_str(const std::string & s, output_formats & format) { + if (s == "none") { + format = NONE; + } else if (s == "csv") { + format = CSV; + } else if (s == "json") { + format = JSON; + } else if (s == "jsonl") { + format = JSONL; + } else if (s == "md") { + format = MARKDOWN; + } else if (s == "sql") { + format = SQL; + } else { + return false; + } + return true; +} + +static const char * split_mode_str(llama_split_mode mode) { + switch (mode) { + case LLAMA_SPLIT_MODE_NONE: + return "none"; + case LLAMA_SPLIT_MODE_LAYER: + return "layer"; + case LLAMA_SPLIT_MODE_ROW: + return "row"; + default: + GGML_ABORT("invalid split mode"); + } +} + +static std::string pair_str(const std::pair & p) { + static char buf[32]; + snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second); + return buf; +} + +static std::vector parse_int_range(const std::string & s) { + // first[-last[(+|*)step]] + std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))"); + + std::smatch match; + std::string::const_iterator search_start(s.cbegin()); + std::vector result; + while (std::regex_search(search_start, s.cend(), match, range_regex)) { + int first = std::stoi(match[1]); + int last = match[2].matched ? std::stoi(match[2]) : first; + char op = match[3].matched ? match[3].str()[0] : '+'; + int step = match[4].matched ? 
std::stoi(match[4]) : 1; + + for (int i = first; i <= last;) { + result.push_back(i); + + int prev_i = i; + + if (op == '+') { + i += step; + } else if (op == '*') { + i *= step; + } else { + throw std::invalid_argument("invalid range format"); + } + + if (i <= prev_i) { + throw std::invalid_argument("invalid range"); + } + } + search_start = match.suffix().first; + } + + if (search_start != s.cend()) { + throw std::invalid_argument("invalid range format"); + } + + return result; +} + +struct cmd_params { + std::vector model; + std::vector n_prompt; + std::vector n_gen; + std::vector> n_pg; + std::vector n_depth; + std::vector n_batch; + std::vector n_ubatch; + std::vector type_k; + std::vector type_v; + std::vector n_threads; + std::vector cpu_mask; + std::vector cpu_strict; + std::vector poll; + std::vector n_gpu_layers; + std::vector n_cpu_moe; + std::vector split_mode; + std::vector main_gpu; + std::vector no_kv_offload; + std::vector flash_attn; + std::vector> devices; + std::vector> tensor_split; + std::vector> tensor_buft_overrides; + std::vector use_mmap; + std::vector use_direct_io; + std::vector embeddings; + std::vector no_op_offload; + std::vector no_host; + ggml_numa_strategy numa; + int reps; + ggml_sched_priority prio; + int delay; + bool verbose; + bool progress; + bool no_warmup; + output_formats output_format; + output_formats output_format_stderr; +}; + +static const cmd_params cmd_params_defaults = { + /* model */ { "models/7B/ggml-model-q4_0.gguf" }, + /* n_prompt */ { 512 }, + /* n_gen */ { 128 }, + /* n_pg */ {}, + /* n_depth */ { 0 }, + /* n_batch */ { 2048 }, + /* n_ubatch */ { 512 }, + /* type_k */ { GGML_TYPE_F16 }, + /* type_v */ { GGML_TYPE_F16 }, + /* n_threads */ { cpu_get_num_math() }, + /* cpu_mask */ { "0x0" }, + /* cpu_strict */ { false }, + /* poll */ { 50 }, + /* n_gpu_layers */ { 99 }, + /* n_cpu_moe */ { 0 }, + /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* main_gpu */ { 0 }, + /* no_kv_offload */ { false }, + /* flash_attn */ { false }, + /* devices */ { {} }, + /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, + /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, + /* use_mmap */ { false }, + /* use_direct_io */ { false }, + /* embeddings */ { false }, + /* no_op_offload */ { false }, + /* no_host */ { false }, + /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* reps */ 5, + /* prio */ GGML_SCHED_PRIO_NORMAL, + /* delay */ 0, + /* verbose */ false, + /* progress */ false, + /* no_warmup */ false, + /* output_format */ MARKDOWN, + /* output_format_stderr */ NONE, +}; + +static void print_usage(int /* argc */, char ** argv) { + printf("usage: %s [options]\n", argv[0]); + printf("\n"); + printf("options:\n"); + printf(" -h, --help\n"); + printf(" --numa numa mode (default: disabled)\n"); + printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", + cmd_params_defaults.reps); + printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", + cmd_params_defaults.prio); + printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", + cmd_params_defaults.delay); + printf(" -o, --output output format printed to stdout (default: %s)\n", + output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err output format printed to stderr (default: %s)\n", + output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" --list-devices list available devices and exit\n"); + printf(" -v, --verbose verbose output\n"); + printf(" --progress print test progress 
indicators\n"); + printf(" --no-warmup skip warmup runs before benchmarking\n"); + if (llama_supports_rpc()) { + printf(" -rpc, --rpc register RPC devices (comma separated)\n"); + } + printf("\n"); + printf("test parameters:\n"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", + join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -pg (default: %s)\n", + join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -d, --n-depth (default: %s)\n", + join(cmd_params_defaults.n_depth, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", + join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", + join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", + join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", + join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", + join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", + join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", + join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ncmoe, --n-cpu-moe (default: %s)\n", + join(cmd_params_defaults.n_cpu_moe, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", + join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", + join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", + join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", + join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -dev, --device (default: auto)\n"); + printf(" -mmp, --mmap <0|1> (default: %s)\n", + join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -dio, --direct-io <0|1> (default: %s)\n", + join(cmd_params_defaults.use_direct_io, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", + join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -ts, --tensor-split (default: 0)\n"); + printf(" -ot --override-tensor =;...\n"); + printf(" (default: disabled)\n"); + printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); + printf(" --no-host <0|1> (default: %s)\n", + join(cmd_params_defaults.no_host, ",").c_str()); + printf("\n"); + printf( + "Multiple values can be given for each parameter by separating them with ','\n" + "or by specifying the parameter multiple times. 
Ranges can be given as\n" + "'first-last' or 'first-last+step' or 'first-last*mult'.\n"); +} + +static ggml_type ggml_type_from_name(const std::string & s) { + if (s == "f16") { + return GGML_TYPE_F16; + } + if (s == "bf16") { + return GGML_TYPE_BF16; + } + if (s == "q8_0") { + return GGML_TYPE_Q8_0; + } + if (s == "q4_0") { + return GGML_TYPE_Q4_0; + } + if (s == "q4_1") { + return GGML_TYPE_Q4_1; + } + if (s == "q5_0") { + return GGML_TYPE_Q5_0; + } + if (s == "q5_1") { + return GGML_TYPE_Q5_1; + } + if (s == "iq4_nl") { + return GGML_TYPE_IQ4_NL; + } + + return GGML_TYPE_COUNT; +} + +static cmd_params parse_cmd_params(int argc, char ** argv) { + cmd_params params; + std::string arg; + bool invalid_param = false; + const std::string arg_prefix = "--"; + const char split_delim = ','; + + params.verbose = cmd_params_defaults.verbose; + params.output_format = cmd_params_defaults.output_format; + params.output_format_stderr = cmd_params_defaults.output_format_stderr; + params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; + params.prio = cmd_params_defaults.prio; + params.delay = cmd_params_defaults.delay; + params.progress = cmd_params_defaults.progress; + params.no_warmup = cmd_params_defaults.no_warmup; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + try { + if (arg == "-h" || arg == "--help") { + print_usage(argc, argv); + exit(0); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.model.insert(params.model.end(), p.begin(), p.end()); + } else if (arg == "-p" || arg == "--n-prompt") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); + } else if (arg == "-n" || arg == "--n-gen") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); + } else if (arg == "-pg") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], ','); + if (p.size() != 2) { + invalid_param = true; + break; + } + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } else if (arg == "-d" || arg == "--n-depth") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_depth.insert(params.n_depth.end(), p.begin(), p.end()); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); + } else if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); + } else if (arg == "-ctk" || arg == "--cache-type-k") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + if (invalid_param) { + break; + } + params.type_k.insert(params.type_k.end(), types.begin(), types.end()); + } else if (arg == "-ctv" || arg == "--cache-type-v") 
{ + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector types; + for (const auto & t : p) { + ggml_type gt = ggml_type_from_name(t); + if (gt == GGML_TYPE_COUNT) { + invalid_param = true; + break; + } + types.push_back(gt); + } + if (invalid_param) { + break; + } + params.type_v.insert(params.type_v.end(), types.begin(), types.end()); + } else if (arg == "-dev" || arg == "--device") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto combos = string_split(argv[i], split_delim); + for (const auto & combo : combos) { + try { + params.devices.push_back(parse_devices_arg(combo)); + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + invalid_param = true; + break; + } + } + if (invalid_param) { + break; + } + } else if (arg == "--list-devices") { + std::vector devices; + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { + devices.push_back(dev); + } + } + printf("Available devices:\n"); + if (devices.empty()) { + printf(" (none)\n"); + } + for (auto * dev : devices) { + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } + exit(0); + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); + } else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); + } else if (arg == "--cpu-strict") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); + } else if (arg == "--poll") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.poll.insert(params.poll.end(), p.begin(), p.end()); + } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end()); + } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { + if (++i >= argc) { + invalid_param = true; + break; + } + try { + register_rpc_server_list(argv[i]); + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + invalid_param = true; + break; + } + } else if (arg == "-sm" || arg == "--split-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector modes; + for (const auto & m : p) { + llama_split_mode mode; + if (m == "none") { + mode = LLAMA_SPLIT_MODE_NONE; + } else if (m == "layer") { + mode = LLAMA_SPLIT_MODE_LAYER; + } else if (m == "row") { + mode = LLAMA_SPLIT_MODE_ROW; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + if (invalid_param) { + break; + } 
+ params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); + } else if (arg == "-mg" || arg == "--main-gpu") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.main_gpu = parse_int_range(argv[i]); + } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + if (value == "distribute" || value == "") { + params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; + } else if (value == "isolate") { + params.numa = GGML_NUMA_STRATEGY_ISOLATE; + } else if (value == "numactl") { + params.numa = GGML_NUMA_STRATEGY_NUMACTL; + } else { + invalid_param = true; + break; + } + } else if (arg == "-fa" || arg == "--flash-attn") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); + } else if (arg == "-mmp" || arg == "--mmap") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); + } else if (arg == "-dio" || arg == "--direct-io") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end()); + } else if (arg == "-embd" || arg == "--embeddings") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); + } else if (arg == "-nopo" || arg == "--no-op-offload") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end()); + } else if (arg == "--no-host") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.no_host.insert(params.no_host.end(), p.begin(), p.end()); + } else if (arg == "-ts" || arg == "--tensor-split") { + if (++i >= argc) { + invalid_param = true; + break; + } + for (auto ts : string_split(argv[i], split_delim)) { + // split string by ; and / + const std::regex regex{ R"([;/]+)" }; + std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + GGML_ASSERT(split_arg.size() <= llama_max_devices()); + + std::vector tensor_split(llama_max_devices()); + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + tensor_split[i] = std::stof(split_arg[i]); + } else { + tensor_split[i] = 0.0f; + } + } + params.tensor_split.push_back(tensor_split); + } + } else if (arg == "-ot" || arg == "--override-tensor") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto * value = argv[i]; + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + auto override_group_span_len = std::strcspn(value, ","); + bool last_group = false; + do { + if 
(override_group_span_len == 0) { + // Adds an empty override-tensors for an empty span + params.tensor_buft_overrides.push_back({{}}); + if (value[override_group_span_len] == '\0') { + value = &value[override_group_span_len]; + last_group = true; + } else { + value = &value[override_group_span_len + 1]; + override_group_span_len = std::strcspn(value, ","); + } + continue; + } + // Stamps null terminators into the argv + // value for this option to avoid the + // memory leak present in the implementation + // over in arg.cpp. Acceptable because we + // only parse these args once in this program. + auto * override_group = value; + if (value[override_group_span_len] == '\0') { + value = &value[override_group_span_len]; + last_group = true; + } else { + value[override_group_span_len] = '\0'; + value = &value[override_group_span_len + 1]; + } + std::vector group_tensor_buft_overrides{}; + auto override_span_len = std::strcspn(override_group, ";"); + while (override_span_len > 0) { + auto * override = override_group; + if (override_group[override_span_len] != '\0') { + override_group[override_span_len] = '\0'; + override_group = &override_group[override_span_len + 1]; + } else { + override_group = &override_group[override_span_len]; + } + auto tensor_name_span_len = std::strcspn(override, "="); + if (tensor_name_span_len >= override_span_len) { + invalid_param = true; + break; + } + override[tensor_name_span_len] = '\0'; + auto * tensor_name = override; + auto * buffer_type = &override[tensor_name_span_len + 1]; + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("error: unrecognized buffer type '%s'\n", buffer_type); + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + invalid_param = true; + break; + } + group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)}); + override_span_len = std::strcspn(override_group, ";"); + } + if (invalid_param) { + break; + } + group_tensor_buft_overrides.push_back({nullptr,nullptr}); + params.tensor_buft_overrides.push_back(group_tensor_buft_overrides); + override_group_span_len = std::strcspn(value, ","); + } while (!last_group); + } else if (arg == "-r" || arg == "--repetitions") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.reps = std::stoi(argv[i]); + } else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); + } else if (arg == "--delay") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.delay = std::stoi(argv[i]); + } else if (arg == "-o" || arg == "--output") { + if (++i >= argc) { + invalid_param = true; + break; + } + invalid_param = !output_format_from_str(argv[i], params.output_format); + } else if (arg == "-oe" || arg == "--output-err") { + if (++i >= argc) { + invalid_param = true; + break; + } + invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); + } else if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + } else if (arg == "--progress") { + params.progress = true; + } else if (arg == "--no-warmup") { + params.no_warmup = true; + } else { + invalid_param = true; + break; + } + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + invalid_param = true; + break; + } + } + + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + print_usage(argc, argv); + exit(1); + } + + // set 
defaults + if (params.model.empty()) { + params.model = cmd_params_defaults.model; + } + if (params.n_prompt.empty()) { + params.n_prompt = cmd_params_defaults.n_prompt; + } + if (params.n_gen.empty()) { + params.n_gen = cmd_params_defaults.n_gen; + } + if (params.n_pg.empty()) { + params.n_pg = cmd_params_defaults.n_pg; + } + if (params.n_depth.empty()) { + params.n_depth = cmd_params_defaults.n_depth; + } + if (params.n_batch.empty()) { + params.n_batch = cmd_params_defaults.n_batch; + } + if (params.n_ubatch.empty()) { + params.n_ubatch = cmd_params_defaults.n_ubatch; + } + if (params.type_k.empty()) { + params.type_k = cmd_params_defaults.type_k; + } + if (params.type_v.empty()) { + params.type_v = cmd_params_defaults.type_v; + } + if (params.n_gpu_layers.empty()) { + params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; + } + if (params.n_cpu_moe.empty()) { + params.n_cpu_moe = cmd_params_defaults.n_cpu_moe; + } + if (params.split_mode.empty()) { + params.split_mode = cmd_params_defaults.split_mode; + } + if (params.main_gpu.empty()) { + params.main_gpu = cmd_params_defaults.main_gpu; + } + if (params.no_kv_offload.empty()) { + params.no_kv_offload = cmd_params_defaults.no_kv_offload; + } + if (params.flash_attn.empty()) { + params.flash_attn = cmd_params_defaults.flash_attn; + } + if (params.devices.empty()) { + params.devices = cmd_params_defaults.devices; + } + if (params.tensor_split.empty()) { + params.tensor_split = cmd_params_defaults.tensor_split; + } + if (params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides; + } + if (params.use_mmap.empty()) { + params.use_mmap = cmd_params_defaults.use_mmap; + } + if (params.use_direct_io.empty()) { + params.use_direct_io = cmd_params_defaults.use_direct_io; + } + if (params.embeddings.empty()) { + params.embeddings = cmd_params_defaults.embeddings; + } + if (params.no_op_offload.empty()) { + params.no_op_offload = cmd_params_defaults.no_op_offload; + } + if (params.no_host.empty()) { + params.no_host = cmd_params_defaults.no_host; + } + if (params.n_threads.empty()) { + params.n_threads = cmd_params_defaults.n_threads; + } + if (params.cpu_mask.empty()) { + params.cpu_mask = cmd_params_defaults.cpu_mask; + } + if (params.cpu_strict.empty()) { + params.cpu_strict = cmd_params_defaults.cpu_strict; + } + if (params.poll.empty()) { + params.poll = cmd_params_defaults.poll; + } + + return params; +} + +struct cmd_params_instance { + std::string model; + int n_prompt; + int n_gen; + int n_depth; + int n_batch; + int n_ubatch; + ggml_type type_k; + ggml_type type_v; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + int n_gpu_layers; + int n_cpu_moe; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector devices; + std::vector tensor_split; + std::vector tensor_buft_overrides; + bool use_mmap; + bool use_direct_io; + bool embeddings; + bool no_op_offload; + bool no_host; + + llama_model_params to_llama_mparams() const { + llama_model_params mparams = llama_model_default_params(); + + mparams.n_gpu_layers = n_gpu_layers; + if (!devices.empty()) { + mparams.devices = const_cast(devices.data()); + } + mparams.split_mode = split_mode; + mparams.main_gpu = main_gpu; + mparams.tensor_split = tensor_split.data(); + mparams.use_mmap = use_mmap; + mparams.use_direct_io = use_direct_io; + mparams.no_host = no_host; + + if (n_cpu_moe <= 0) { + if (tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = nullptr; + 
} else { + GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && + "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + } + } else { + static std::vector merged; + static std::vector patterns; + + merged.clear(); + patterns.clear(); + + auto first = tensor_buft_overrides.begin(); + auto last = tensor_buft_overrides.end(); + if (first != last && (last - 1)->pattern == nullptr) { + --last; + } + merged.insert(merged.end(), first, last); + + patterns.reserve((size_t) n_cpu_moe); + merged.reserve(merged.size() + (size_t) n_cpu_moe + 1); + + for (int i = 0; i < n_cpu_moe; ++i) { + patterns.push_back(llm_ffn_exps_block_regex(i)); + merged.push_back({ patterns.back().c_str(), + ggml_backend_cpu_buffer_type() }); + } + + merged.push_back({ nullptr, nullptr }); + + mparams.tensor_buft_overrides = merged.data(); + } + + return mparams; + } + + bool equal_mparams(const cmd_params_instance & other) const { + return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && + split_mode == other.split_mode && + main_gpu == other.main_gpu && tensor_split == other.tensor_split && + use_mmap == other.use_mmap && use_direct_io == other.use_direct_io && + devices == other.devices && + no_host == other.no_host && + vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); + } + + llama_context_params to_llama_cparams() const { + llama_context_params cparams = llama_context_default_params(); + + cparams.n_ctx = n_prompt + n_gen + n_depth; + cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; + cparams.type_k = type_k; + cparams.type_v = type_v; + cparams.offload_kqv = !no_kv_offload; + cparams.flash_attn_type = flash_attn ? 
LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; + cparams.embeddings = embeddings; + cparams.op_offload = !no_op_offload; + cparams.swa_full = false; + + return cparams; + } +}; + +static std::vector get_cmd_params_instances(const cmd_params & params) { + std::vector instances; + + // this ordering minimizes the number of times that each model needs to be reloaded + // clang-format off + for (const auto & m : params.model) + for (const auto & nl : params.n_gpu_layers) + for (const auto & ncmoe : params.n_cpu_moe) + for (const auto & sm : params.split_mode) + for (const auto & mg : params.main_gpu) + for (const auto & devs : params.devices) + for (const auto & ts : params.tensor_split) + for (const auto & ot : params.tensor_buft_overrides) + for (const auto & mmp : params.use_mmap) + for (const auto & dio : params.use_direct_io) + for (const auto & noh : params.no_host) + for (const auto & embd : params.embeddings) + for (const auto & nopo : params.no_op_offload) + for (const auto & nb : params.n_batch) + for (const auto & nub : params.n_ubatch) + for (const auto & tk : params.type_k) + for (const auto & tv : params.type_v) + for (const auto & nkvo : params.no_kv_offload) + for (const auto & fa : params.flash_attn) + for (const auto & nt : params.n_threads) + for (const auto & cm : params.cpu_mask) + for (const auto & cs : params.cpu_strict) + for (const auto & nd : params.n_depth) + for (const auto & pl : params.poll) { + for (const auto & n_prompt : params.n_prompt) { + if (n_prompt == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ n_prompt, + /* .n_gen = */ 0, + /* .n_depth = */ nd, + /* .n_batch = */ nb, + /* .n_ubatch = */ nub, + /* .type_k = */ tk, + /* .type_v = */ tv, + /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, + /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, + /* .split_mode = */ sm, + /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, + /* .devices = */ devs, + /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, + /* .use_mmap = */ mmp, + /* .use_direct_io= */ dio, + /* .embeddings = */ embd, + /* .no_op_offload= */ nopo, + /* .no_host = */ noh, + }; + instances.push_back(instance); + } + + for (const auto & n_gen : params.n_gen) { + if (n_gen == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ 0, + /* .n_gen = */ n_gen, + /* .n_depth = */ nd, + /* .n_batch = */ nb, + /* .n_ubatch = */ nub, + /* .type_k = */ tk, + /* .type_v = */ tv, + /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, + /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, + /* .split_mode = */ sm, + /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, + /* .devices = */ devs, + /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, + /* .use_mmap = */ mmp, + /* .use_direct_io= */ dio, + /* .embeddings = */ embd, + /* .no_op_offload= */ nopo, + /* .no_host = */ noh, + }; + instances.push_back(instance); + } + + for (const auto & n_pg : params.n_pg) { + if (n_pg.first == 0 && n_pg.second == 0) { + continue; + } + cmd_params_instance instance = { + /* .model = */ m, + /* .n_prompt = */ n_pg.first, + /* .n_gen = */ n_pg.second, + /* .n_depth = */ nd, + /* .n_batch = */ nb, + /* .n_ubatch = */ nub, + /* .type_k = */ tk, + /* .type_v = */ tv, + /* .n_threads = */ nt, + /* .cpu_mask = */ cm, + /* .cpu_strict = */ cs, + /* .poll = */ pl, + /* 
.n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, + /* .split_mode = */ sm, + /* .main_gpu = */ mg, + /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, + /* .devices = */ devs, + /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, + /* .use_mmap = */ mmp, + /* .use_direct_io= */ dio, + /* .embeddings = */ embd, + /* .no_op_offload= */ nopo, + /* .no_host = */ noh, + }; + instances.push_back(instance); + } + } + // clang-format on + + return instances; +} + +struct test { + static const std::string build_commit; + static const int build_number; + const std::string cpu_info; + const std::string gpu_info; + std::string model_filename; + std::string model_type; + uint64_t model_size; + uint64_t model_n_params; + int n_batch; + int n_ubatch; + int n_threads; + std::string cpu_mask; + bool cpu_strict; + int poll; + ggml_type type_k; + ggml_type type_v; + int n_gpu_layers; + int n_cpu_moe; + llama_split_mode split_mode; + int main_gpu; + bool no_kv_offload; + bool flash_attn; + std::vector devices; + std::vector tensor_split; + std::vector tensor_buft_overrides; + bool use_mmap; + bool use_direct_io; + bool embeddings; + bool no_op_offload; + bool no_host; + int n_prompt; + int n_gen; + int n_depth; + std::string test_time; + std::vector samples_ns; + + test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) : + cpu_info(get_cpu_info()), + gpu_info(get_gpu_info()) { + + model_filename = inst.model; + char buf[128]; + llama_model_desc(lmodel, buf, sizeof(buf)); + model_type = buf; + model_size = llama_model_size(lmodel); + model_n_params = llama_model_n_params(lmodel); + n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; + n_threads = inst.n_threads; + cpu_mask = inst.cpu_mask; + cpu_strict = inst.cpu_strict; + poll = inst.poll; + type_k = inst.type_k; + type_v = inst.type_v; + n_gpu_layers = inst.n_gpu_layers; + n_cpu_moe = inst.n_cpu_moe; + split_mode = inst.split_mode; + main_gpu = inst.main_gpu; + no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; + devices = inst.devices; + tensor_split = inst.tensor_split; + tensor_buft_overrides = inst.tensor_buft_overrides; + use_mmap = inst.use_mmap; + use_direct_io = inst.use_direct_io; + embeddings = inst.embeddings; + no_op_offload = inst.no_op_offload; + no_host = inst.no_host; + n_prompt = inst.n_prompt; + n_gen = inst.n_gen; + n_depth = inst.n_depth; + // RFC 3339 date-time format + time_t t = time(NULL); + std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); + test_time = buf; + + (void) ctx; + } + + uint64_t avg_ns() const { return ::avg(samples_ns); } + + uint64_t stdev_ns() const { return ::stdev(samples_ns); } + + std::vector get_ts() const { + int n_tokens = n_prompt + n_gen; + std::vector ts; + std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), + [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; }); + return ts; + } + + double avg_ts() const { return ::avg(get_ts()); } + + double stdev_ts() const { return ::stdev(get_ts()); } + + static std::string get_backend() { + std::vector backends; + bool rpc_used = false; + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + auto * reg = ggml_backend_reg_get(i); + std::string name = ggml_backend_reg_name(reg); + if (string_starts_with(name, "RPC")) { + if (ggml_backend_reg_dev_count(reg) > 0) { + rpc_used = true; + } + } else { + if (name != "CPU") { + backends.push_back(ggml_backend_reg_name(reg)); + } + } + } + if (rpc_used) { + backends.push_back("RPC"); + } + return backends.empty() ? 
"CPU" : join(backends, ","); + } + + static const std::vector & get_fields() { + static const std::vector fields = { + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", + "model_filename", "model_type", "model_size", "model_n_params", "n_batch", + "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", + "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split", + "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings", + "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth", + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" + }; + return fields; + } + + enum field_type { STRING, BOOL, INT, FLOAT }; + + static field_type get_field_type(const std::string & field) { + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || + field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || + field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { + return INT; + } + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || + field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") { + return BOOL; + } + if (field == "avg_ts" || field == "stddev_ts") { + return FLOAT; + } + return STRING; + } + + std::vector get_values() const { + std::string tensor_split_str; + std::string tensor_buft_overrides_str; + int max_nonzero = 0; + for (size_t i = 0; i < llama_max_devices(); i++) { + if (tensor_split[i] > 0) { + max_nonzero = i; + } + } + for (int i = 0; i <= max_nonzero; i++) { + char buf[32]; + snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]); + tensor_split_str += buf; + if (i < max_nonzero) { + tensor_split_str += "/"; + } + } + if (tensor_buft_overrides.size() == 1) { + // Last element of tensor_buft_overrides is always a null pattern + // so if it is only one element long, it must be a null pattern. 
+ GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr); + tensor_buft_overrides_str += "none"; + } else { + for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) { + // Last element of tensor_buft_overrides is always a null pattern + if (tensor_buft_overrides[i].pattern == nullptr) { + tensor_buft_overrides_str += "none"; + } else { + tensor_buft_overrides_str += tensor_buft_overrides[i].pattern; + tensor_buft_overrides_str += "="; + tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft); + } + if (i + 2 < tensor_buft_overrides.size()) { + tensor_buft_overrides_str += ";"; + } + } + } + std::vector values = { build_commit, + std::to_string(build_number), + cpu_info, + gpu_info, + get_backend(), + model_filename, + model_type, + std::to_string(model_size), + std::to_string(model_n_params), + std::to_string(n_batch), + std::to_string(n_ubatch), + std::to_string(n_threads), + cpu_mask, + std::to_string(cpu_strict), + std::to_string(poll), + ggml_type_name(type_k), + ggml_type_name(type_v), + std::to_string(n_gpu_layers), + std::to_string(n_cpu_moe), + split_mode_str(split_mode), + std::to_string(main_gpu), + std::to_string(no_kv_offload), + std::to_string(flash_attn), + devices_to_string(devices), + tensor_split_str, + tensor_buft_overrides_str, + std::to_string(use_mmap), + std::to_string(use_direct_io), + std::to_string(embeddings), + std::to_string(no_op_offload), + std::to_string(no_host), + std::to_string(n_prompt), + std::to_string(n_gen), + std::to_string(n_depth), + test_time, + std::to_string(avg_ns()), + std::to_string(stdev_ns()), + std::to_string(avg_ts()), + std::to_string(stdev_ts()) }; + return values; + } + + std::map get_map() const { + std::map map; + auto fields = get_fields(); + auto values = get_values(); + std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()), + std::make_pair); + return map; + } +}; + +const std::string test::build_commit = LLAMA_COMMIT; +const int test::build_number = LLAMA_BUILD_NUMBER; + +struct printer { + virtual ~printer() {} + + FILE * fout; + + virtual void print_header(const cmd_params & params) { (void) params; } + + virtual void print_test(const test & t) = 0; + + virtual void print_footer() {} +}; + +struct csv_printer : public printer { + static std::string escape_csv(const std::string & field) { + std::string escaped = "\""; + for (auto c : field) { + if (c == '"') { + escaped += "\""; + } + escaped += c; + } + escaped += "\""; + return escaped; + } + + void print_header(const cmd_params & params) override { + std::vector fields = test::get_fields(); + fprintf(fout, "%s\n", join(fields, ",").c_str()); + (void) params; + } + + void print_test(const test & t) override { + std::vector values = t.get_values(); + std::transform(values.begin(), values.end(), values.begin(), escape_csv); + fprintf(fout, "%s\n", join(values, ",").c_str()); + } +}; + +static std::string escape_json(const std::string & value) { + std::string escaped; + for (auto c : value) { + if (c == '"') { + escaped += "\\\""; + } else if (c == '\\') { + escaped += "\\\\"; + } else if (c <= 0x1f) { + char buf[8]; + snprintf(buf, sizeof(buf), "\\u%04x", c); + escaped += buf; + } else { + escaped += c; + } + } + return escaped; +} + +static std::string format_json_value(const std::string & field, const std::string & value) { + switch (test::get_field_type(field)) { + case test::STRING: + return "\"" + escape_json(value) + "\""; + case test::BOOL: + return value == "0" ? 
"false" : "true"; + default: + return value; + } +} + +struct json_printer : public printer { + bool first = true; + + void print_header(const cmd_params & params) override { + fprintf(fout, "[\n"); + (void) params; + } + + void print_fields(const std::vector & fields, const std::vector & values) { + assert(fields.size() == values.size()); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), + format_json_value(fields.at(i), values.at(i)).c_str()); + } + } + + void print_test(const test & t) override { + if (first) { + first = false; + } else { + fprintf(fout, ",\n"); + } + fprintf(fout, " {\n"); + print_fields(test::get_fields(), t.get_values()); + fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str()); + fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str()); + fprintf(fout, " }"); + fflush(fout); + } + + void print_footer() override { fprintf(fout, "\n]\n"); } +}; + +struct jsonl_printer : public printer { + void print_fields(const std::vector & fields, const std::vector & values) { + assert(fields.size() == values.size()); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str()); + } + } + + void print_test(const test & t) override { + fprintf(fout, "{"); + print_fields(test::get_fields(), t.get_values()); + fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str()); + fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str()); + fprintf(fout, "}\n"); + fflush(fout); + } +}; + +struct markdown_printer : public printer { + std::vector fields; + + static int get_field_width(const std::string & field) { + if (field == "model") { + return -30; + } + if (field == "t/s") { + return 20; + } + if (field == "size" || field == "params") { + return 10; + } + if (field == "n_gpu_layers") { + return 3; + } + if (field == "n_threads") { + return 7; + } + if (field == "n_batch") { + return 7; + } + if (field == "n_ubatch") { + return 8; + } + if (field == "type_k" || field == "type_v") { + return 6; + } + if (field == "split_mode") { + return 5; + } + if (field == "flash_attn") { + return 2; + } + if (field == "devices") { + return -12; + } + if (field == "use_mmap") { + return 4; + } + if (field == "use_direct_io") { + return 3; + } + if (field == "test") { + return 15; + } + if (field == "no_op_offload") { + return 4; + } + if (field == "no_host") { + return 4; + } + + int width = std::max((int) field.length(), 10); + + if (test::get_field_type(field) == test::STRING) { + return -width; + } + return width; + } + + static std::string get_field_display_name(const std::string & field) { + if (field == "n_gpu_layers") { + return "ngl"; + } + if (field == "split_mode") { + return "sm"; + } + if (field == "n_threads") { + return "threads"; + } + if (field == "no_kv_offload") { + return "nkvo"; + } + if (field == "flash_attn") { + return "fa"; + } + if (field == "use_mmap") { + return "mmap"; + } + if (field == "use_direct_io") { + return "dio"; + } + if (field == "embeddings") { + return "embd"; + } + if (field == "no_op_offload") { + return "nopo"; + } + if (field == "no_host") { + return "noh"; + } + if (field == "devices") { + return "dev"; + } + if (field == "tensor_split") { + return "ts"; + } + if (field == "tensor_buft_overrides") { + return "ot"; + } + return field; + } + + void print_header(const cmd_params & params) override { + // select fields to print + 
fields.emplace_back("model"); + fields.emplace_back("size"); + fields.emplace_back("params"); + fields.emplace_back("backend"); + bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos || + test::get_backend().find("BLAS") != std::string::npos || + test::get_backend().find("ZenDNN") != std::string::npos; + if (!is_cpu_backend) { + fields.emplace_back("n_gpu_layers"); + } + if (params.n_cpu_moe.size() > 1) { + fields.emplace_back("n_cpu_moe"); + } + if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { + fields.emplace_back("n_threads"); + } + if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) { + fields.emplace_back("cpu_mask"); + } + if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) { + fields.emplace_back("cpu_strict"); + } + if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) { + fields.emplace_back("poll"); + } + if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { + fields.emplace_back("n_batch"); + } + if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) { + fields.emplace_back("n_ubatch"); + } + if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { + fields.emplace_back("type_k"); + } + if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { + fields.emplace_back("type_v"); + } + if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { + fields.emplace_back("main_gpu"); + } + if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { + fields.emplace_back("split_mode"); + } + if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { + fields.emplace_back("no_kv_offload"); + } + if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) { + fields.emplace_back("flash_attn"); + } + if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) { + fields.emplace_back("devices"); + } + if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { + fields.emplace_back("tensor_split"); + } + if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) { + fields.emplace_back("tensor_buft_overrides"); + } + if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { + fields.emplace_back("use_mmap"); + } + if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) { + fields.emplace_back("use_direct_io"); + } + if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { + fields.emplace_back("embeddings"); + } + if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) { + fields.emplace_back("no_op_offload"); + } + if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) { + fields.emplace_back("no_host"); + } + fields.emplace_back("test"); + fields.emplace_back("t/s"); + + fprintf(fout, "|"); + for (const auto & field : fields) { + fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str()); + } + fprintf(fout, "\n"); + fprintf(fout, "|"); + for (const auto & field : fields) { + int width = get_field_width(field); + fprintf(fout, " %s%s |", 
std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-"); + } + fprintf(fout, "\n"); + } + + void print_test(const test & t) override { + std::map vmap = t.get_map(); + + fprintf(fout, "|"); + for (const auto & field : fields) { + std::string value; + char buf[128]; + if (field == "model") { + value = t.model_type; + } else if (field == "size") { + if (t.model_size < 1024 * 1024 * 1024) { + snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0); + } else { + snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0); + } + value = buf; + } else if (field == "params") { + if (t.model_n_params < 1000 * 1000 * 1000) { + snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6); + } else { + snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9); + } + value = buf; + } else if (field == "backend") { + value = test::get_backend(); + } else if (field == "test") { + if (t.n_prompt > 0 && t.n_gen == 0) { + snprintf(buf, sizeof(buf), "pp%d", t.n_prompt); + } else if (t.n_gen > 0 && t.n_prompt == 0) { + snprintf(buf, sizeof(buf), "tg%d", t.n_gen); + } else { + snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); + } + if (t.n_depth > 0) { + int len = strlen(buf); + snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth); + } + value = buf; + } else if (field == "t/s") { + snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); + value = buf; + } else if (vmap.find(field) != vmap.end()) { + value = vmap.at(field); + } else { + assert(false); + exit(1); + } + + int width = get_field_width(field); + if (field == "t/s") { + // HACK: the utf-8 character is 2 bytes + width += 1; + } + fprintf(fout, " %*s |", width, value.c_str()); + } + fprintf(fout, "\n"); + } + + void print_footer() override { + fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number); + } +}; + +struct sql_printer : public printer { + static std::string get_sql_field_type(const std::string & field) { + switch (test::get_field_type(field)) { + case test::STRING: + return "TEXT"; + case test::BOOL: + case test::INT: + return "INTEGER"; + case test::FLOAT: + return "REAL"; + default: + assert(false); + exit(1); + } + } + + void print_header(const cmd_params & params) override { + std::vector fields = test::get_fields(); + fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n"); + for (size_t i = 0; i < fields.size(); i++) { + fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(), + i < fields.size() - 1 ? "," : ""); + } + fprintf(fout, ");\n"); + fprintf(fout, "\n"); + (void) params; + } + + void print_test(const test & t) override { + fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str()); + fprintf(fout, "VALUES ("); + std::vector values = t.get_values(); + for (size_t i = 0; i < values.size(); i++) { + fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? 
", " : ""); + } + fprintf(fout, ");\n"); + } +}; + +struct ctx_state { + int depth = 0; // in tokens + + std::vector buf; // the llama_context state buffer +}; + +static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { + llama_set_n_threads(ctx, n_threads, n_threads); + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + std::vector tokens(n_batch); + + int n_processed = 0; + + while (n_processed < n_prompt) { + int n_tokens = std::min(n_prompt - n_processed, n_batch); + tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; + for (int i = 1; i < n_tokens; i++) { + tokens[i] = std::rand() % n_vocab; + } + int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens)); + if (res != 0) { + fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res); + return false; + } + n_processed += n_tokens; + } + + llama_synchronize(ctx); + return true; +} + +static bool test_gen(llama_context * ctx, int n_gen, int n_threads) { + llama_set_n_threads(ctx, n_threads, n_threads); + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + + llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab; + + for (int i = 0; i < n_gen; i++) { + int res = llama_decode(ctx, llama_batch_get_one(&token, 1)); + if (res != 0) { + fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res); + return false; + } + llama_synchronize(ctx); + token = std::rand() % n_vocab; + } + return true; +} + +static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) text; + (void) user_data; +} + +static std::unique_ptr create_printer(output_formats format) { + switch (format) { + case NONE: + return nullptr; + case CSV: + return std::unique_ptr(new csv_printer()); + case JSON: + return std::unique_ptr(new json_printer()); + case JSONL: + return std::unique_ptr(new jsonl_printer()); + case MARKDOWN: + return std::unique_ptr(new markdown_printer()); + case SQL: + return std::unique_ptr(new sql_printer()); + } + GGML_ABORT("fatal error"); +} + +int main(int argc, char ** argv) { + // try to set locale for unicode characters in markdown + setlocale(LC_CTYPE, ".UTF-8"); + +#if !defined(NDEBUG) + fprintf(stderr, "warning: asserts enabled, performance may be affected\n"); +#endif + +#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__)) + fprintf(stderr, "warning: debug build, performance may be affected\n"); +#endif + +#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__) + fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n"); +#endif + + // initialize backends + ggml_backend_load_all(); + + cmd_params params = parse_cmd_params(argc, argv); + + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) 
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + + // initialize llama.cpp + if (!params.verbose) { + llama_log_set(llama_null_log_callback, NULL); + } + llama_backend_init(); + llama_numa_init(params.numa); + + if (!set_process_priority(params.prio)) { + fprintf(stderr, "%s: error: failed to set process priority\n", __func__); + return 1; + } + + // initialize printer + std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p_err = create_printer(params.output_format_stderr); + + if (p) { + p->fout = stdout; + p->print_header(params); + } + + if (p_err) { + p_err->fout = stderr; + p_err->print_header(params); + } + + std::vector params_instances = get_cmd_params_instances(params); + + llama_model * lmodel = nullptr; + const cmd_params_instance * prev_inst = nullptr; + + // store the llama_context state at the previous depth that we performed a test + // ref: https://github.com/ggml-org/llama.cpp/pull/16944#issuecomment-3478151721 + ctx_state cstate; + + int params_idx = 0; + auto params_count = params_instances.size(); + for (const auto & inst : params_instances) { + params_idx++; + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); + } + // keep the same model between tests when possible + if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { + if (lmodel) { + llama_model_free(lmodel); + } + + lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams()); + if (lmodel == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); + return 1; + } + prev_inst = &inst; + } + + llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams()); + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); + llama_model_free(lmodel); + return 1; + } + + test t(inst, lmodel, ctx); + + llama_memory_clear(llama_get_memory(ctx), false); + + // cool off before the test + if (params.delay) { + std::this_thread::sleep_for(std::chrono::seconds(params.delay)); + } + + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); + if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) { + fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + tpp.strict_cpu = t.cpu_strict; + tpp.poll = t.poll; + tpp.prio = params.prio; + + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); + if (!threadpool) { + fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + + llama_attach_threadpool(ctx, threadpool, NULL); + + // warmup run + if (!params.no_warmup) { + if (t.n_prompt > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); + } + //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); + bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + if (t.n_gen > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); + } + bool res = test_gen(ctx, 1, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: 
failed to run gen warmup\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + } + + for (int i = 0; i < params.reps; i++) { + llama_memory_clear(llama_get_memory(ctx), false); + + if (t.n_depth > 0) { + bool is_cached = t.n_depth == cstate.depth; + + if (is_cached) { + // if previously we have computed at this depth, just restore the state + const size_t ret = llama_state_seq_set_data(ctx, cstate.buf.data(), cstate.buf.size(), 0); + if (ret == 0) { + // if the old state is incompatible with the current context - reprocess from scratch + is_cached = false; + } + } + + if (!is_cached) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run depth\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + + // store the context state for reuse in later runs + cstate.depth = t.n_depth; + cstate.buf.resize(llama_state_seq_get_size(ctx, 0)); + llama_state_seq_get_data(ctx, cstate.buf.data(), cstate.buf.size(), 0); + } else { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d (cached)\n", params_idx, params_count, + i + 1, params.reps); + } + } + } + + uint64_t t_start = get_time_ns(); + + if (t.n_prompt > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run prompt\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + if (t.n_gen > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + bool res = test_gen(ctx, t.n_gen, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run gen\n", __func__); + llama_free(ctx); + llama_model_free(lmodel); + exit(1); + } + } + + uint64_t t_ns = get_time_ns() - t_start; + t.samples_ns.push_back(t_ns); + } + + if (p) { + p->print_test(t); + fflush(p->fout); + } + + if (p_err) { + p_err->print_test(t); + fflush(p_err->fout); + } + + llama_perf_context_print(ctx); + + llama_free(ctx); + + ggml_threadpool_free_fn(threadpool); + } + + llama_model_free(lmodel); + + if (p) { + p->print_footer(); + } + + if (p_err) { + p_err->print_footer(); + } + + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp/tools/mtmd/CMakeLists.txt b/llama.cpp/tools/mtmd/CMakeLists.txt new file mode 100644 index 0000000..02d71f2 --- /dev/null +++ b/llama.cpp/tools/mtmd/CMakeLists.txt @@ -0,0 +1,96 @@ +# mtmd + +find_package(Threads REQUIRED) + +add_library(mtmd + mtmd.cpp + mtmd-audio.cpp + mtmd.h + mtmd-helper.cpp + mtmd-helper.h + clip.cpp + clip.h + clip-impl.h + clip-model.h + clip-graph.h + models/models.h + models/cogvlm.cpp + models/conformer.cpp + models/glm4v.cpp + models/internvl.cpp + models/kimivl.cpp + models/kimik25.cpp + models/llama4.cpp + models/llava.cpp + models/minicpmv.cpp + models/pixtral.cpp + models/qwen2vl.cpp + models/qwen3vl.cpp + models/siglip.cpp + models/whisper-enc.cpp + models/mobilenetv5.cpp + models/youtuvl.cpp + ) + +set_target_properties(mtmd PROPERTIES + VERSION ${LLAMA_INSTALL_VERSION} + SOVERSION 0 + MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version 
number +) + +target_link_libraries (mtmd PUBLIC ggml llama) +target_link_libraries (mtmd PRIVATE Threads::Threads) +target_include_directories(mtmd PUBLIC .) +target_include_directories(mtmd PRIVATE ../..) +target_include_directories(mtmd PRIVATE ../../vendor) +target_compile_features (mtmd PRIVATE cxx_std_17) + +if (BUILD_SHARED_LIBS) + set_target_properties (mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(mtmd PRIVATE LLAMA_BUILD) + target_compile_definitions(mtmd PUBLIC LLAMA_SHARED) +endif() + +set(MTMD_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd.h + ${CMAKE_CURRENT_SOURCE_DIR}/mtmd-helper.h + ) + +set_target_properties(mtmd + PROPERTIES + PUBLIC_HEADER "${MTMD_PUBLIC_HEADERS}") + +install(TARGETS mtmd LIBRARY PUBLIC_HEADER) + +if (NOT MSVC) + # for stb_image.h and miniaudio.h + target_compile_options(mtmd PRIVATE -Wno-cast-qual) +endif() + +if (TARGET BUILD_INFO) + add_dependencies(mtmd BUILD_INFO) + add_dependencies(mtmd-helper BUILD_INFO) +endif() + +# if mtmd is linked against common, we throw an error +if (TARGET mtmd) + get_target_property(libs mtmd LINK_LIBRARIES) + if (libs AND "common" IN_LIST libs) + message(FATAL_ERROR "mtmd is designed to be a public library.\n" + "It must not link against common") + endif() +endif() + +add_executable(llama-llava-cli deprecation-warning.cpp) +add_executable(llama-gemma3-cli deprecation-warning.cpp) +add_executable(llama-minicpmv-cli deprecation-warning.cpp) +add_executable(llama-qwen2vl-cli deprecation-warning.cpp) + +set(TARGET llama-mtmd-cli) +add_executable (${TARGET} mtmd-cli.cpp) +set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) +endif() +target_link_libraries (${TARGET} PRIVATE common mtmd Threads::Threads) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/llama.cpp/tools/mtmd/README.md b/llama.cpp/tools/mtmd/README.md new file mode 100644 index 0000000..ef31d19 --- /dev/null +++ b/llama.cpp/tools/mtmd/README.md @@ -0,0 +1,63 @@ +# Multimodal Support in llama.cpp + +This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported. + +> [!IMPORTANT] +> +> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**. + +The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify: + +- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction. +- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure. +- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. 
This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users. +- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs. +- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`. + +## Pre-quantized models + +See the list of pre-quantized models [here](../../docs/multimodal.md). + +## How it works and what is `mmproj`? + +Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model. + +This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging. + +Consequently, running a multimodal model typically requires two GGUF files: +1. The standard language model file. +2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection. + +## What is `libmtmd`? + +As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs. + +Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages: +- **Unified Interface:** Aims to consolidate interaction for various multimodal models. +- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library. +- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models. + +## How to obtain `mmproj` + +Multimodal projector (`mmproj`) files are specific to each model architecture.
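+
+Once you have a matching `mmproj` file, it is passed alongside the language model at runtime. The command below is only an illustrative sketch: the file names are placeholders, and the exact flags may differ between releases (see `llama-mtmd-cli --help` for the current options):
+
+```sh
+# hypothetical file names; the language model goes to -m, the projector to --mmproj
+llama-mtmd-cli -m model-text.gguf --mmproj mmproj-model.gguf --image input.jpg -p "Describe this image."
+```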
+ +For the following models, you can use `convert_hf_to_gguf.py` with `--mmproj` flag to get the `mmproj` file: +- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) ; See the guide [here](../../docs/multimodal/gemma3.md) - Note: 1B variant does not have vision support +- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint +- Qwen 2 VL and Qwen 2.5 VL (from [Qwen](https://huggingface.co/Qwen)) +- [Mistral Small 3.1 24B](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503) +- InternVL 2.5 and InternVL 3 from [OpenGVLab](https://huggingface.co/OpenGVLab) (note: we don't support conversion of `InternVL3-*-hf` model, only non-HF version is supported ; `InternLM2Model` **text** model is not supported) + +For older models, please refer to the relevant guide for instructions on how to obtain or create them: + +NOTE: conversion scripts are located under `tools/mtmd/legacy-models` + +- [LLaVA](../../docs/multimodal/llava.md) +- [MobileVLM](../../docs/multimodal/MobileVLM.md) +- [GLM-Edge](../../docs/multimodal/glmedge.md) +- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md) +- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md) +- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md) +- [IBM Granite Vision](../../docs/multimodal/granitevision.md) diff --git a/llama.cpp/tools/mtmd/clip-graph.h b/llama.cpp/tools/mtmd/clip-graph.h new file mode 100644 index 0000000..4c7f750 --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-graph.h @@ -0,0 +1,117 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpp.h" +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" + +#include +#include + +#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS) + +struct clip_graph { + const clip_model & model; + const clip_hparams & hparams; + projector_type proj_type; + + // we only support single image per batch + const clip_image_f32 & img; + + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const int n_mmproj_embd; + const float eps; + const float kq_scale; + const clip_flash_attn_type flash_attn_type; + + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; + + clip_graph(clip_ctx * ctx, const clip_image_f32 & img); + + virtual ~clip_graph() = default; + virtual ggml_cgraph * build() = 0; + + // + // utility functions + // + void cb(ggml_tensor * cur0, const char * name, int il) const; + + // siglip2 naflex + ggml_tensor * resize_position_embeddings(uint32_t interpolation_mode = DEFAULT_INTERPOLATION_MODE); + + // build vision transformer (ViT) cgraph + // this function should cover most of the models + // if your model has specific features, you should probably duplicate this function + ggml_tensor * build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos); + + // build the input after conv2d (inp_raw --> patches) + // returns tensor with shape [n_embd, n_patches] + ggml_tensor * build_inp(); + + ggml_tensor * build_inp_raw(int channels = 3); + + ggml_tensor * build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float 
norm_eps, + int il) const; + + ggml_tensor * build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const; + + ggml_tensor * build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const; + + // implementation of the 2D RoPE without adding a new op in ggml + // this is not efficient (use double the memory), but works on all backends + // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 + ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq + ); + + // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) + // support dynamic resolution + ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor); + + // Generic function to stack frames for audio processing + // Abstracts out the StackAudioFrames logic used by ultravox + ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed); +}; diff --git a/llama.cpp/tools/mtmd/clip-impl.h b/llama.cpp/tools/mtmd/clip-impl.h new file mode 100644 index 0000000..3bc93ea --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-impl.h @@ -0,0 +1,582 @@ +#pragma once + +#include "ggml.h" +#include "gguf.h" +#include "clip.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// Internal header for clip.cpp + +#define MTMD_INTERNAL_HEADER + +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder" +#define KEY_HAS_VISION_ENC "clip.has_vision_encoder" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_USE_SILU "clip.use_silu" + +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" + +// vision-specific +#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels" +#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels" +#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" +#define KEY_IS_DEEPSTACK_LAYERS "clip.vision.is_deepstack_layers" + +#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" +#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" +#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" +#define KEY_WIN_ATTN_PATTERN 
"clip.vision.n_wa_pattern" +#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes" +#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" +#define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" + +// audio-specific +#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities +#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" +#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor" + + +// +// tensor name constants +// + +#define TN_POS_EMBD "%s.position_embd.weight" +#define TN_CLASS_EMBD "v.class_embd" +#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat +#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" +#define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_NORM_EMBD "v.norm_embd.%s" +#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" +#define TN_ATTN_K "%s.blk.%d.attn_k.%s" +#define TN_ATTN_Q "%s.blk.%d.attn_q.%s" +#define TN_ATTN_V "%s.blk.%d.attn_v.%s" +#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s" +#define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s" +#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" +#define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" +#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm +#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm +#define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale +#define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale +#define TN_LN_PRE "%s.pre_ln.%s" +#define TN_LN_POST "%s.post_ln.%s" +#define TN_LLAVA_PROJ "mm.%d.%s" +#define TN_MM_UP "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_DOWN "mm.down.%s" +#define TN_MM_POST_NORM "mm.post_norm.%s" +#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" +#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" +#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" +#define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MM_INP_NORM "mm.input_norm.weight" +#define TN_MM_INP_NORM_B "mm.input_norm.bias" +#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 +#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_MM_PATCH_MERGER "mm.patch_merger.%s" // mistral small 3.1, glm4v +#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral +#define TN_TOK_GLM_BOI "adapter.boi" // glm-edge (these embeddings are not in text model) +#define TN_TOK_GLM_EOI "adapter.eoi" // glm-edge (these embeddings are not in text model) +#define TN_DEEPSTACK_NORM "v.deepstack.%d.norm.%s" // qwen3vl deepstack +#define TN_DEEPSTACK_FC1 "v.deepstack.%d.fc1.%s" // qwen3vl deepstack +#define TN_DEEPSTACK_FC2 "v.deepstack.%d.fc2.%s" // qwen3vl deepstack + +// mimicpmv +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + +#define TN_GLM_ADAPER_CONV "adapter.conv.%s" +#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s" +#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s" +#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" +#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" +#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" + +// ultravox +#define TN_CONV1D "a.conv1d.%d.%s" +#define TN_MM_AUDIO_MLP 
"mm.a.mlp.%d.%s" +#define TN_MM_AUDIO_FC "mm.a.fc.%s" // fully connected layer +#define TN_MM_NORM_PRE "mm.a.norm_pre.%s" +#define TN_MM_NORM_MID "mm.a.norm_mid.%s" + +// cogvlm +#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s" +#define TN_MM_H_TO_4H "mm.up.%s" +#define TN_MM_GATE "mm.gate.%s" +#define TN_MM_4H_TO_H "mm.down.%s" +#define TN_TOK_BOI "v.boi" +#define TN_TOK_EOI "v.eoi" + +// (conformer) lfm2 +#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" +#define TN_FFN_NORM_1 "%s.blk.%d.ffn_norm_1.%s" +#define TN_FFN_UP_1 "%s.blk.%d.ffn_up_1.%s" +#define TN_FFN_DOWN_1 "%s.blk.%d.ffn_down_1.%s" +#define TN_POS_BIAS_U "%s.blk.%d.pos_bias_u" +#define TN_POS_BIAS_V "%s.blk.%d.pos_bias_v" +#define TN_NORM_CONV "%s.blk.%d.norm_conv.%s" +#define TN_LINEAR_POS "%s.blk.%d.linear_pos.%s" +#define TN_CONV_DW "%s.blk.%d.conv_dw.%s" +#define TN_CONV_NORM "%s.blk.%d.conv_norm.%s" +#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s" +#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" + +// mobilenetv5 (gemma3n) definitions +#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight" +#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias" +#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight" + +// Stage 0 Block (Edge Residual) +#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight" +#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight" +#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight" +#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight" + +// Stage 1+ Block (Universal Inverted Residual) +#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight" +#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight" +#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight" +#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight" +#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight" +#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight" +#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight" +#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight" +#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma" + +// Attention Components +#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight" +#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight" +#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight" +#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight" +#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight" +#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight" +#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight" +#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight" +#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks + +// MSFA +#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight" +#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight" +#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight" +#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight" +#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight" + + +// align x to upper multiple of n +#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) + +// forward declaration +// TODO: improve this later +struct clip_ctx; + +enum projector_type { + PROJECTOR_TYPE_MLP, + PROJECTOR_TYPE_MLP_NORM, + PROJECTOR_TYPE_LDP, + PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_MINICPMV, + PROJECTOR_TYPE_GLM_EDGE, + PROJECTOR_TYPE_QWEN2VL, + PROJECTOR_TYPE_QWEN3VL, + PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_GEMMA3NV, + PROJECTOR_TYPE_GEMMA3NA, + PROJECTOR_TYPE_IDEFICS3, + 
PROJECTOR_TYPE_PIXTRAL, + PROJECTOR_TYPE_QWEN25VL, + PROJECTOR_TYPE_ULTRAVOX, + PROJECTOR_TYPE_INTERNVL, + PROJECTOR_TYPE_LLAMA4, + PROJECTOR_TYPE_QWEN2A, + PROJECTOR_TYPE_GLMA, + PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx + PROJECTOR_TYPE_VOXTRAL, + PROJECTOR_TYPE_MUSIC_FLAMINGO, + PROJECTOR_TYPE_LFM2, + PROJECTOR_TYPE_KIMIVL, + PROJECTOR_TYPE_LIGHTONOCR, + PROJECTOR_TYPE_COGVLM, + PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_LFM2A, + PROJECTOR_TYPE_GLM4V, + PROJECTOR_TYPE_YOUTUVL, + PROJECTOR_TYPE_KIMIK25, + PROJECTOR_TYPE_UNKNOWN, +}; + +static std::map PROJECTOR_TYPE_NAMES = { + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_MINICPMV, "resampler"}, + { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, + { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, + { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, + { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, + { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, + { PROJECTOR_TYPE_INTERNVL, "internvl"}, + { PROJECTOR_TYPE_LLAMA4, "llama4"}, + { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, + { PROJECTOR_TYPE_GLMA, "glma"}, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, + { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, + { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"}, + { PROJECTOR_TYPE_LFM2, "lfm2"}, + { PROJECTOR_TYPE_KIMIVL, "kimivl"}, + { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, + { PROJECTOR_TYPE_COGVLM, "cogvlm"}, + { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_LFM2A, "lfm2a"}, + { PROJECTOR_TYPE_GLM4V, "glm4v"}, + { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, + { PROJECTOR_TYPE_KIMIK25, "kimik25"}, +}; + +static projector_type clip_projector_type_from_string(const std::string & str) { + for (const auto & pair : PROJECTOR_TYPE_NAMES) { + if (pair.second == str) { + return pair.first; + } + } + return PROJECTOR_TYPE_UNKNOWN; +} + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector buf; +}; + +// For images, buf.size() == nx*ny*3 +// Memory layout: RGBRGBRGB... +// For audio, only one channel is used, buf.size() == nx*ny +// nx will be n_frames and ny will be n_mel +struct clip_image_f32 { + int nx; + int ny; + + std::vector buf; +}; + +// +// logging +// + +static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + +struct clip_logger_state { + ggml_log_callback log_callback; + void * log_callback_user_data; +}; + +extern struct clip_logger_state g_logger_state; + +static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) { + if (format == NULL) { + return; + } + va_list args_copy; + va_copy(args_copy, args); + char buffer[128]; + int len = vsnprintf(buffer, 128, format, args); + if (len < 128) { + g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data); + } else { + char * buffer2 = (char *) calloc(len + 1, sizeof(char)); + vsnprintf(buffer2, len + 1, format, args_copy); + buffer2[len] = 0; + g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data); + free(buffer2); + } + va_end(args_copy); +} + +static void clip_log_internal(enum ggml_log_level level, const char * format, ...) 
{ + va_list args; + va_start(args, format); + clip_log_internal_v(level, format, args); + va_end(args); +} + +#define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) +#define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__) + +// +// cpp wrappers +// + +// wrapper for clip_image_size +struct clip_image_size_deleter { + void operator()(clip_image_size * val) { clip_image_size_free(val); } +}; +typedef std::unique_ptr clip_image_size_ptr; + +// wrapper for clip_image_u8 +struct clip_image_u8_deleter { + void operator()(clip_image_u8 * val) { clip_image_u8_free(val); } +}; +typedef std::unique_ptr clip_image_u8_ptr; + +// wrapper for clip_image_f32 +struct clip_image_f32_deleter { + void operator()(clip_image_f32 * val) { clip_image_f32_free(val); } +}; +typedef std::unique_ptr clip_image_f32_ptr; + +struct clip_image_u8_batch { + std::vector entries; +}; + +struct clip_image_f32_batch { + std::vector entries; + bool is_audio = false; + + // for llava-uhd style models, we need to know the grid size + // note: entries.size() == grid_x * grid_y + 1 (one overview image) + int grid_x = 0; + int grid_y = 0; + + clip_image_f32_batch clone() const { + clip_image_f32_batch new_batch{ + /* entries */ {}, + /* is_audio */ is_audio, + /* grid_x */ grid_x, + /* grid_y */ grid_y, + }; + new_batch.entries.reserve(entries.size()); + for (const auto & entry : entries) { + new_batch.entries.emplace_back(new clip_image_f32(*entry)); + } + return new_batch; + } +}; + +// +// common utils +// + +static std::string string_format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), buf.size()); +} + +static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; + } + std::string builder; + builder.reserve(s.length()); + size_t pos = 0; + size_t last_pos = 0; + while ((pos = s.find(search, last_pos)) != std::string::npos) { + builder.append(s, last_pos, pos - last_pos); + builder.append(replace); + last_pos = pos + search.length(); + } + builder.append(s, last_pos, std::string::npos); + s = std::move(builder); +} + +// split string by a `std::string delim` instead of `char delim` +static std::vector string_split_str(std::string s, const std::string & delimiter) { + std::vector tokens; + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + tokens.push_back(token); + s.erase(0, pos + delimiter.length()); + } + tokens.push_back(s); + return tokens; +} + +// +// gguf utils +// + +static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { + switch (type) { + case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]); + case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]); + case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]); + case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]); + case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]); + case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]); + case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]); + case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]); + case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]); + case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]); + case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false"; + default: return string_format("unknown type %d", type); + } +} + +static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { + const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + + switch (type) { + case GGUF_TYPE_STRING: + return gguf_get_val_str(ctx_gguf, i); + case GGUF_TYPE_ARRAY: + { + const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i); + int arr_n = gguf_get_arr_n(ctx_gguf, i); + const void * data = arr_type == GGUF_TYPE_STRING ? 
nullptr : gguf_get_arr_data(ctx_gguf, i); + std::stringstream ss; + ss << "["; + for (int j = 0; j < arr_n; j++) { + if (arr_type == GGUF_TYPE_STRING) { + std::string val = gguf_get_arr_str(ctx_gguf, i, j); + // escape quotes + string_replace_all(val, "\\", "\\\\"); + string_replace_all(val, "\"", "\\\""); + ss << '"' << val << '"'; + } else if (arr_type == GGUF_TYPE_ARRAY) { + ss << "???"; + } else { + ss << gguf_data_to_str(arr_type, data, j); + } + if (j < arr_n - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + } + default: + return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0); + } +} + +// +// debugging +// + +static void print_tensor_shape(ggml_tensor * t) { + printf("%s.shape = [", t->name); + for (int i = 0; i < ggml_n_dims(t); ++i) { + printf("%" PRId64, t->ne[i]); + if (i < ggml_n_dims(t) - 1) { + printf(", "); + } + } + printf("]\n"); +} + +static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) { + ggml_type type = t->type; + int64_t * ne = t->ne; + size_t * nb = t->nb; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + printf("%s.data: [\n", t->name); + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + if (i2 == n && ne[2] > 2*n) { + printf(" ..., \n"); + i2 = ne[2] - n; + } + printf(" [\n"); + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + if (i1 == n && ne[1] > 2*n) { + printf(" ..., \n"); + i1 = ne[1] - n; + } + printf(" ["); + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + if (i0 == n && ne[0] > 2*n) { + printf("..., "); + i0 = ne[0] - n; + } + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); + } else if (type == GGML_TYPE_F32) { + v = *(float *) &data[i]; + } else if (type == GGML_TYPE_I32) { + v = (float) *(int32_t *) &data[i]; + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) &data[i]; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) &data[i]; + } else { + GGML_ABORT("fatal error"); + } + printf("%8.4f", v); + if (i0 < ne[0] - 1) printf(", "); + } + printf("],\n"); + } + printf(" ],\n"); + } + printf(" ]\n"); + } +} + +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value); + +// +// API used internally with mtmd +// + +projector_type clip_get_projector_type(const struct clip_ctx * ctx); diff --git a/llama.cpp/tools/mtmd/clip-model.h b/llama.cpp/tools/mtmd/clip-model.h new file mode 100644 index 0000000..d4ff915 --- /dev/null +++ b/llama.cpp/tools/mtmd/clip-model.h @@ -0,0 +1,389 @@ +#pragma once + +#include "ggml.h" +#include "clip.h" +#include "clip-impl.h" + +#include +#include +#include +#include +#include + +enum ffn_op_type { + FFN_GELU, + FFN_GELU_ERF, + FFN_SILU, + FFN_GELU_QUICK, +}; + +enum norm_type { + NORM_TYPE_NORMAL, + NORM_TYPE_RMS, +}; + +enum patch_merge_type { + PATCH_MERGE_FLAT, + PATCH_MERGE_SPATIAL_UNPAD, +}; + +struct clip_hparams { + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; + // idefics3 + int32_t image_longest_edge = 0; + int32_t image_min_pixels = -1; + int32_t image_max_pixels = -1; + int32_t n_merge = 0; // number of patch merges **per-side** + + float image_mean[3]; + float image_std[3]; + + // for models using dynamic image size, we need to have a smaller image size to warmup + // otherwise, user will get OOM everytime they load the model + int32_t warmup_image_size = 0; + int32_t warmup_audio_size = 3000; + + ffn_op_type ffn_op = FFN_GELU; + + 
patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; + + float eps = 1e-6; + float rope_theta = 0.0; + + std::vector image_res_candidates; // for llava-uhd style models + int32_t image_crop_resolution; + std::unordered_set vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; + std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) + + // audio + int32_t n_mel_bins = 0; // whisper preprocessor + int32_t proj_stack_factor = 0; // ultravox + + // audio-to-mel preprocessor params + int32_t audio_chunk_len = -1; // in seconds + int32_t audio_sample_rate = -1; + int32_t audio_n_fft = -1; + int32_t audio_window_len = -1; + int32_t audio_hop_len = -1; + + // legacy + bool has_llava_projector = false; + int minicpmv_version = 0; + int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // custom value provided by user, can be undefined if not set + int32_t custom_image_min_tokens = -1; + int32_t custom_image_max_tokens = -1; + + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; + image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + // TODO: support warmup size for custom token numbers + } +}; + +struct clip_layer { + // attention + ggml_tensor * k_w = nullptr; + ggml_tensor * k_b = nullptr; + ggml_tensor * q_w = nullptr; + ggml_tensor * q_b = nullptr; + ggml_tensor * v_w = nullptr; + ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; + + ggml_tensor * o_w = nullptr; + ggml_tensor * o_b = nullptr; + + ggml_tensor * k_norm = nullptr; + ggml_tensor * q_norm = nullptr; + + // layernorm 1 + ggml_tensor * ln_1_w = nullptr; + ggml_tensor * ln_1_b = nullptr; + + ggml_tensor * ff_up_w = nullptr; + ggml_tensor * ff_up_b = nullptr; + ggml_tensor * ff_gate_w = nullptr; + ggml_tensor * ff_gate_b = nullptr; + ggml_tensor * ff_down_w = nullptr; + ggml_tensor * ff_down_b = nullptr; + + // layernorm 2 + ggml_tensor * ln_2_w = nullptr; + ggml_tensor * ln_2_b = nullptr; + + // layer scale (no bias) + ggml_tensor * ls_1_w = nullptr; + ggml_tensor * ls_2_w = nullptr; + + // qwen3vl deepstack merger + ggml_tensor * deepstack_norm_w = nullptr; + ggml_tensor * deepstack_norm_b = nullptr; + ggml_tensor * deepstack_fc1_w = nullptr; + ggml_tensor * deepstack_fc1_b = nullptr; + ggml_tensor * deepstack_fc2_w = nullptr; + ggml_tensor * deepstack_fc2_b = nullptr; + + // lfm2 + ggml_tensor * ff_norm_w = nullptr; + ggml_tensor * ff_norm_b = nullptr; + ggml_tensor * ff_norm_1_w = nullptr; + ggml_tensor * ff_norm_1_b = nullptr; + ggml_tensor * ff_up_1_w = nullptr; + ggml_tensor * ff_up_1_b = nullptr; + ggml_tensor * ff_down_1_w = nullptr; + ggml_tensor * ff_down_1_b = nullptr; + ggml_tensor * pos_bias_u = nullptr; + ggml_tensor * pos_bias_v = nullptr; + ggml_tensor * norm_conv_w = nullptr; + ggml_tensor * norm_conv_b = 
nullptr; + ggml_tensor * linear_pos_w = nullptr; + + ggml_tensor * conv_norm_w = nullptr; + ggml_tensor * conv_norm_b = nullptr; + ggml_tensor * conv_dw_w = nullptr; + ggml_tensor * conv_dw_b = nullptr; + ggml_tensor * conv_pw1_w = nullptr; + ggml_tensor * conv_pw1_b = nullptr; + ggml_tensor * conv_pw2_w = nullptr; + ggml_tensor * conv_pw2_b = nullptr; + + bool has_deepstack() const { + return deepstack_fc1_w != nullptr; + } +}; + +// Expanded MobileNetV5 block structure for Gemma3n vision encoder +struct mobilenetv5_block { + // Stage 0 (Edge Residual) + ggml_tensor * s0_conv_exp_w = nullptr; + ggml_tensor * s0_bn1_w = nullptr; + ggml_tensor * s0_conv_pwl_w = nullptr; + ggml_tensor * s0_bn2_w = nullptr; + + // Stage 1+ (Universal Inverted Residual) + ggml_tensor * dw_start_w = nullptr; + ggml_tensor * dw_start_bn_w = nullptr; + + ggml_tensor * pw_exp_w = nullptr; + ggml_tensor * pw_exp_bn_w = nullptr; + + ggml_tensor * dw_mid_w = nullptr; + ggml_tensor * dw_mid_bn_w = nullptr; + + ggml_tensor * pw_proj_w = nullptr; + ggml_tensor * pw_proj_bn_w = nullptr; + + ggml_tensor * layer_scale_w = nullptr; + + // Attention (MQA) components + ggml_tensor * attn_q_w = nullptr; + ggml_tensor * attn_k_w = nullptr; + ggml_tensor * attn_v_w = nullptr; + ggml_tensor * attn_o_w = nullptr; + + // Optional downsampling/norm in attention + ggml_tensor * attn_k_dw_w = nullptr; + ggml_tensor * attn_k_norm_w = nullptr; + ggml_tensor * attn_v_dw_w = nullptr; + ggml_tensor * attn_v_norm_w = nullptr; + + // Block norm (often present in attention blocks) + ggml_tensor * attn_norm_w = nullptr; +}; + +struct clip_model { + clip_modality modality = CLIP_MODALITY_VISION; + projector_type proj_type = PROJECTOR_TYPE_MLP; + clip_hparams hparams; + + // embeddings + ggml_tensor * class_embedding = nullptr; + ggml_tensor * patch_embeddings_0 = nullptr; + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_bias = nullptr; + ggml_tensor * position_embeddings = nullptr; + ggml_tensor * norm_embd_w = nullptr; + ggml_tensor * norm_embd_b = nullptr; + + ggml_tensor * pre_ln_w = nullptr; + ggml_tensor * pre_ln_b = nullptr; + + std::vector layers; + + int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer + + ggml_tensor * post_ln_w; + ggml_tensor * post_ln_b; + + ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; + ggml_tensor * mm_ffn_up_w = nullptr; + ggml_tensor * mm_ffn_up_b = nullptr; + ggml_tensor * mm_ffn_gate_w = nullptr; + ggml_tensor * mm_ffn_gate_b = nullptr; + ggml_tensor * mm_ffn_down_w = nullptr; + ggml_tensor * mm_ffn_down_b = nullptr; + ggml_tensor * mm_post_norm_w = nullptr; + ggml_tensor * mm_post_norm_b = nullptr; + + // LLaVA projection + ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_input_norm_b = nullptr; + ggml_tensor * mm_0_w = nullptr; + ggml_tensor * mm_0_b = nullptr; + ggml_tensor * mm_2_w = nullptr; + ggml_tensor * mm_2_b = nullptr; + + ggml_tensor * image_newline = nullptr; + + // Yi type models with mlp+normalization projection + ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 + ggml_tensor * mm_1_b = nullptr; + ggml_tensor * mm_3_w = nullptr; + ggml_tensor * mm_3_b = nullptr; + ggml_tensor * mm_4_w = nullptr; + ggml_tensor * mm_4_b = nullptr; + + // GLMV-Edge projection + ggml_tensor * mm_model_adapter_conv_w = nullptr; + ggml_tensor * mm_model_adapter_conv_b = nullptr; + + // 
MobileVLM projection + ggml_tensor * mm_model_mlp_1_w = nullptr; + ggml_tensor * mm_model_mlp_1_b = nullptr; + ggml_tensor * mm_model_mlp_3_w = nullptr; + ggml_tensor * mm_model_mlp_3_b = nullptr; + ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; + + // MobileVLM_V2 projection + ggml_tensor * mm_model_mlp_0_w = nullptr; + ggml_tensor * mm_model_mlp_0_b = nullptr; + ggml_tensor * mm_model_mlp_2_w = nullptr; + ggml_tensor * mm_model_mlp_2_b = nullptr; + ggml_tensor * mm_model_peg_0_w = nullptr; + ggml_tensor * mm_model_peg_0_b = nullptr; + + // MINICPMV projection + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * mm_model_ln_q_b = nullptr; + ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; + + // gemma3 + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // mobilenetv5 for gemma3n + std::vector mobilenet_blocks; + std::vector mobilenet_stage_ends; + ggml_tensor * mobilenet_stem_conv_w = nullptr; + ggml_tensor * mobilenet_stem_conv_b = nullptr; + ggml_tensor * mobilenet_stem_norm_w = nullptr; + ggml_tensor * mm_post_proj_norm_w = nullptr; + + // Multi-Scale Fusion Adapter (MSFA) components + ggml_tensor * msfa_concat_conv_w = nullptr; + ggml_tensor * msfa_concat_norm_w = nullptr; + ggml_tensor * msfa_ffn_expand_w = nullptr; + ggml_tensor * msfa_ffn_project_w = nullptr; + ggml_tensor * msfa_ffn_expand_bn = nullptr; + ggml_tensor * msfa_ffn_project_bn = nullptr; + + + // pixtral, glm4v + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; + ggml_tensor * mm_patch_merger_b = nullptr; + + // ultravox / whisper encoder + ggml_tensor * conv1d_1_w = nullptr; + ggml_tensor * conv1d_1_b = nullptr; + ggml_tensor * conv1d_2_w = nullptr; + ggml_tensor * conv1d_2_b = nullptr; + ggml_tensor * 
mm_norm_pre_w = nullptr; + ggml_tensor * mm_norm_pre_b = nullptr; + ggml_tensor * mm_norm_mid_w = nullptr; + + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + + // lfm2 audio + std::array pre_encode_conv_X_w = {nullptr}; + std::array pre_encode_conv_X_b = {nullptr}; + ggml_tensor * pre_encode_out_w = nullptr; + ggml_tensor * pre_encode_out_b = nullptr; + + bool audio_has_avgpool() const { + return proj_type == PROJECTOR_TYPE_QWEN2A + || proj_type == PROJECTOR_TYPE_VOXTRAL + || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO; + } + + bool audio_has_stack_frames() const { + return proj_type == PROJECTOR_TYPE_ULTRAVOX + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } +}; + +const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx); diff --git a/llama.cpp/tools/mtmd/clip.cpp b/llama.cpp/tools/mtmd/clip.cpp new file mode 100644 index 0000000..eeccb4c --- /dev/null +++ b/llama.cpp/tools/mtmd/clip.cpp @@ -0,0 +1,4080 @@ +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" +#include "clip-graph.h" +#include "models/models.h" + +#include "ggml.h" +#include "ggml-cpp.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; + +//#define CLIP_DEBUG_FUNCTIONS + +#ifdef CLIP_DEBUG_FUNCTIONS +static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + // PPM header: P6 format, width, height, and max color value + file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; + + // Write pixel data + for (size_t i = 0; i < img.buf.size(); i += 3) { + // PPM expects binary data in RGB format, which matches our image buffer + file.write(reinterpret_cast(&img.buf[i]), 3); + } + + file.close(); +} + +static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) { + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); + return; + } + + int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data + int bytesPerPixel = 3; + int widthInBytes = img.nx * bytesPerPixel; + int paddingAmount = (4 - (widthInBytes % 4)) % 4; + int stride = widthInBytes + paddingAmount; + + // Bitmap file header + unsigned char fileHeader[14] = { + 'B','M', // Signature + 0,0,0,0, // Image file size in bytes + 0,0,0,0, // Reserved + 54,0,0,0 // Start of pixel array + }; + + // Total file size + fileSize = 54 + (stride * img.ny); + fileHeader[2] = (unsigned char)(fileSize); + fileHeader[3] = (unsigned char)(fileSize >> 8); + fileHeader[4] = (unsigned char)(fileSize >> 16); + fileHeader[5] = (unsigned char)(fileSize >> 24); + + // Bitmap information header (BITMAPINFOHEADER) + unsigned char infoHeader[40] = { + 40,0,0,0, // Size of this header (40 bytes) + 0,0,0,0, // Image width + 0,0,0,0, // Image height + 1,0, // Number of color planes + 24,0, // Bits per pixel + 0,0,0,0, // No compression + 0,0,0,0, // Image size (can be 0 
for no compression) + 0,0,0,0, // X pixels per meter (not specified) + 0,0,0,0, // Y pixels per meter (not specified) + 0,0,0,0, // Total colors (color table not used) + 0,0,0,0 // Important colors (all are important) + }; + + // Width and height in the information header + infoHeader[4] = (unsigned char)(img.nx); + infoHeader[5] = (unsigned char)(img.nx >> 8); + infoHeader[6] = (unsigned char)(img.nx >> 16); + infoHeader[7] = (unsigned char)(img.nx >> 24); + infoHeader[8] = (unsigned char)(img.ny); + infoHeader[9] = (unsigned char)(img.ny >> 8); + infoHeader[10] = (unsigned char)(img.ny >> 16); + infoHeader[11] = (unsigned char)(img.ny >> 24); + + // Write file headers + file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); + file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); + + // Pixel data + std::vector padding(3, 0); // Max padding size to be added to each row + for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top + for (int x = 0; x < img.nx; ++x) { + // Each pixel + size_t pixelIndex = (y * img.nx + x) * 3; + unsigned char pixel[3] = { + img.buf[pixelIndex + 2], // BMP stores pixels in BGR format + img.buf[pixelIndex + 1], + img.buf[pixelIndex] + }; + file.write(reinterpret_cast(pixel), 3); + } + // Write padding for the row + file.write(reinterpret_cast(padding.data()), paddingAmount); + } + + file.close(); +} + +// debug function to convert f32 to u8 +static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(3 * src.nx * src.ny); + for (size_t i = 0; i < src.buf.size(); ++i) { + dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); + } +} +#endif + + +struct clip_ctx { + clip_model model; + + gguf_context_ptr ctx_gguf; + ggml_context_ptr ctx_data; + + std::vector buf_compute_meta; + + std::vector backend_ptrs; + std::vector backend_buft; + + ggml_backend_t backend = nullptr; + ggml_backend_t backend_cpu = nullptr; + ggml_backend_buffer_ptr buf; + + + int max_nodes = 8192; + ggml_backend_sched_ptr sched; + clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO; + bool is_allocated = false; + + clip_ctx(clip_context_params & ctx_params) { + flash_attn_type = ctx_params.flash_attn_type; + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + if (!backend_cpu) { + throw std::runtime_error("failed to initialize CPU backend"); + } + if (ctx_params.use_gpu) { + auto backend_name = std::getenv("MTMD_BACKEND_DEVICE"); + if (backend_name != nullptr) { + backend = ggml_backend_init_by_name(backend_name, nullptr); + if (!backend) { + LOG_WRN("%s: Warning: Failed to initialize \"%s\" backend, falling back to default GPU backend\n", __func__, backend_name); + } + } + if (!backend) { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr); + backend = backend ? 
backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr); + } + } + + if (backend) { + LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); + backend_ptrs.push_back(backend); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + } else { + backend = backend_cpu; + LOG_INF("%s: CLIP using CPU backend\n", __func__); + } + + if (ctx_params.image_min_tokens > 0) { + model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens; + } + if (ctx_params.image_max_tokens > 0) { + model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens; + } + + backend_ptrs.push_back(backend_cpu); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); + + sched.reset( + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) + ); + + if (ctx_params.cb_eval != nullptr) { + ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data); + } + } + + ~clip_ctx() { + ggml_backend_free(backend); + if (backend != backend_cpu) { + ggml_backend_free(backend_cpu); + } + } + + // this function is added so that we don't change too much of the existing code + projector_type proj_type() const { + return model.proj_type; + } +}; + +// +// clip_graph +// + +clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + model(ctx->model), + hparams(model.hparams), + proj_type(ctx->proj_type()), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + n_mmproj_embd(clip_n_mmproj_embd(ctx)), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)), + flash_attn_type(ctx->flash_attn_type) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); +} + +void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } +} + +// siglip2 naflex +ggml_tensor * clip_graph::resize_position_embeddings(uint32_t interpolation_mode) { + ggml_tensor * pos_embd = model.position_embeddings; + const int height = img.ny / patch_size; + const int width = img.nx / patch_size; + const uint32_t mode = interpolation_mode; + const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); + + GGML_ASSERT(pos_embd); + + if (height == n_per_side && width == n_per_side) { + return pos_embd; + } + + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) + pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) + pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) + + return pos_embd; +} + +// build vision transformer (ViT) cgraph +// this function should cover most of the models +// if your model has specific features, you should probably duplicate this function +ggml_tensor * clip_graph::build_vit( + 
ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos + ) { + if (learned_pos_embd) { + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "pos_embed", -1); + } + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + cb(inpL, "pre_ln", -1); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + if (layer.qkv_w != nullptr) { + // fused qkv + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + if (layer.qkv_b != nullptr) { + cur = ggml_add(ctx0, cur, layer.qkv_b); + } + + Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + // TODO: q/k norm requires row size == n_embd, while here it's d_head + // we can add support in the future if needed + GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); + + } else { + // separate q, k, v + Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + if (layer.q_norm) { + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); + cb(Kcur, "Kcur_norm", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + if (add_pos) { + Qcur = add_pos(Qcur, layer); + Kcur = add_pos(Kcur, layer); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + } + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + if (layer.ls_1_w) { + cur = ggml_mul(ctx0, cur, layer.ls_1_w); + cb(cur, "attn_out_scaled", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + ffn_t, il); + + cb(cur, "ffn_out", il); + + if (layer.ls_2_w) { + cur = ggml_mul(ctx0, cur, layer.ls_2_w); + cb(cur, "ffn_out_scaled", il); + } + + // residual 2 + cur = 
ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + if (model.audio_has_avgpool()) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); + } + return inpL; +} + +// build the input after conv2d (inp_raw --> patches) +// returns tensor with shape [n_embd, n_patches] +ggml_tensor * clip_graph::build_inp() { + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + return inp; +} + +ggml_tensor * clip_graph::build_inp_raw(int channels) { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + return inp_raw; +} + +ggml_tensor * clip_graph::build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const { + + cur = type == NORM_TYPE_RMS + ? ggml_rms_norm(ctx0, cur, norm_eps) + : ggml_norm(ctx0, cur, norm_eps); + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + cb(cur, "norm_w", il); + } + + if (mb) { + cur = ggml_add(ctx0, cur, mb); + cb(cur, "norm_b", il); + } + + return cur; +} + +ggml_tensor * clip_graph::build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const { + + ggml_tensor * tmp = up ? 
ggml_mul_mat(ctx0, up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + cur = ggml_mul_mat(ctx0, gate, cur); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } + + // we only support parallel ffn for now + switch (type_op) { + case FFN_SILU: + if (gate) { + cur = ggml_swiglu_split(ctx0, cur, tmp); + cb(cur, "ffn_swiglu", il); + } else { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case FFN_GELU: + if (gate) { + cur = ggml_geglu_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu", il); + } else { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + } break; + case FFN_GELU_ERF: + if (gate) { + cur = ggml_geglu_erf_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_erf", il); + } else { + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "ffn_gelu_erf", il); + } break; + case FFN_GELU_QUICK: + if (gate) { + cur = ggml_geglu_quick_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_quick", il); + } else { + cur = ggml_gelu_quick(ctx0, cur); + cb(cur, "ffn_gelu_quick", il); + } break; + } + + if (down) { + cur = ggml_mul_mat(ctx0, down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + return cur; +} + +ggml_tensor * clip_graph::build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * cur; + + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + + k = ggml_cast(ctx0, k, GGML_TYPE_F16); + v = ggml_cast(ctx0, v, GGML_TYPE_F16); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + + } else { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); + + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // F32 may not needed for vision encoders? 
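+        // shape bookkeeping for the naive path (editorial note, not part of the original code):
+        //   q   : [d_head, n_pos_q, n_head]   k : [d_head, n_pos_k, n_head]   (after the 0,2,1,3 permutes above)
+        //   kq  = ggml_mul_mat(k, q)   -> [n_pos_k, n_pos_q, n_head]
+        //   kq  = soft_max(kq * kq_scale + mask) over the key dimension, with kq_scale = 1/sqrt(d_head)
+        //   kqv = ggml_mul_mat(v, kq)  -> [d_head, n_pos_q, n_head], then flattened to [d_head*n_head, n_tokens]
+        // the optional F32 upcast below is left disabled; it can be re-enabled if the default matmul precision causes issues: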
+ // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + } + + cb(cur, "kqv_out", il); + + if (wo) { + cur = ggml_mul_mat(ctx0, wo, cur); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +// implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +ggml_tensor * clip_graph::build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = interleave_freq + ? std::pow(freq_base, (float)-2/n_dim) + : 1.0; + + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_a, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + cur->nb[1], + cur->nb[2], + n_dim/2 * ggml_element_size(cur)); + second = ggml_rope_ext( + ctx0, + second, + pos_b, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +// Generic function to stack frames for audio processing +// Abstracts out the StackAudioFrames logic used by ultravox +ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) { + if (stack_factor <= 1) { + return cur; + } + + int64_t total_elements = ggml_nelements(cur); + int64_t stride = n_embed * stack_factor; + + // Calculate padded length + int64_t padded_len = GGML_PAD(total_elements, stride); + int64_t pad = padded_len - total_elements; + + if (pad > 0) { + // Pad the tensor to make it divisible by stride + cur = ggml_view_1d(ctx0, cur, total_elements, 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); + } + + // Reshape to [stride, padded_len / stride] + cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, + ggml_row_size(cur->type, stride), 0); + return cur; +} + +// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) +// support dynamic resolution +ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int 
scale_factor) { + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cb(cur, "pixel_shuffle", -1); + + return cur; +} + +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); + + const clip_image_f32 & img = *imgs.entries[0]; + std::unique_ptr builder; + + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_JANUS_PRO: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_QWEN3VL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_MINICPMV: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_LLAMA4: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_KIMIVL: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_KIMIK25: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_COGVLM: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_LFM2A: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_GLM4V: + { + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + builder = std::make_unique(ctx, img); + } break; + default: + GGML_ABORT("missing cgraph builder"); + } + + return builder->build(); +} + +// +// clip_model_loader +// + +struct clip_model_loader { + ggml_context_ptr ctx_meta; + gguf_context_ptr ctx_gguf; + + std::string fname; + + size_t model_size = 0; // in bytes + + bool has_vision = false; + bool has_audio = false; + + // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model + clip_model_loader(const char * fname) : fname(fname) { + struct ggml_context * meta = nullptr; + + 
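+        // two-phase load (editorial note, not part of the original code):
+        //   phase 1 (here):          open the GGUF with no_alloc = true, so only KV metadata and tensor
+        //                            descriptors (name, shape, type, offset) are materialized in `meta`
+        //   phase 2 (load_tensors):  duplicate the descriptors into the model's own ggml context,
+        //                            allocate backend buffers and read the weight data at the recorded offsets
+        // a minimal metadata-only probe following the same pattern (sketch; the file name is hypothetical):
+        //   ggml_context * meta_only = nullptr;
+        //   gguf_init_params probe_params = { /*.no_alloc =*/ true, /*.ctx =*/ &meta_only };
+        //   gguf_context * probe = gguf_init_from_file("mmproj-model.gguf", probe_params);
+        //   int64_t n_tensors = gguf_get_n_tensors(probe);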
struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &meta, + }; + + ctx_gguf = gguf_context_ptr(gguf_init_from_file(fname, params)); + if (!ctx_gguf.get()) { + throw std::runtime_error(string_format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); + } + + ctx_meta.reset(meta); + + const int n_tensors = gguf_get_n_tensors(ctx_gguf.get()); + + // print gguf info + { + std::string name; + get_string(KEY_NAME, name, false); + std::string description; + get_string(KEY_DESCRIPTION, description, false); + LOG_INF("%s: model name: %s\n", __func__, name.c_str()); + LOG_INF("%s: description: %s\n", __func__, description.c_str()); + LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx_gguf.get())); + LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx_gguf.get())); + LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_INF("%s: n_kv: %d\n", __func__, (int)gguf_get_n_kv(ctx_gguf.get())); + LOG_INF("\n"); + } + + // modalities + { + get_bool(KEY_HAS_VISION_ENC, has_vision, false); + get_bool(KEY_HAS_AUDIO_ENC, has_audio, false); + + if (has_vision) { + LOG_INF("%s: has vision encoder\n", __func__); + } + if (has_audio) { + LOG_INF("%s: has audio encoder\n", __func__); + } + } + + // tensors + { + for (int i = 0; i < n_tensors; ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + const size_t offset = gguf_get_tensor_offset(ctx_gguf.get(), i); + enum ggml_type type = gguf_get_tensor_type(ctx_gguf.get(), i); + ggml_tensor * cur = ggml_get_tensor(meta, name); + size_t tensor_size = ggml_nbytes(cur); + model_size += tensor_size; + LOG_DBG("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); + } + } + } + + void load_hparams(clip_model & model, clip_modality modality) { + auto & hparams = model.hparams; + std::string log_ffn_op; // for logging + + // sanity check + if (modality == CLIP_MODALITY_VISION) { + GGML_ASSERT(has_vision); + } else if (modality == CLIP_MODALITY_AUDIO) { + GGML_ASSERT(has_audio); + } + model.modality = modality; + + + // projector type + std::string proj_type; + { + // default key + get_string(KEY_PROJ_TYPE, proj_type, false); + + // for models with mixed modalities + if (proj_type.empty()) { + if (modality == CLIP_MODALITY_VISION) { + get_string(KEY_VISION_PROJ_TYPE, proj_type, false); + } else if (modality == CLIP_MODALITY_AUDIO) { + get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false); + } else { + GGML_ABORT("unknown modality"); + } + } + + model.proj_type = clip_projector_type_from_string(proj_type); + + if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) { + throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str())); + } + + // correct arch for multimodal models (legacy method) + if (model.proj_type == PROJECTOR_TYPE_QWEN25O) { + model.proj_type = modality == CLIP_MODALITY_VISION + ? PROJECTOR_TYPE_QWEN25VL + : PROJECTOR_TYPE_QWEN2A; + } + } + + const bool is_vision = model.modality == CLIP_MODALITY_VISION; + const bool is_audio = model.modality == CLIP_MODALITY_AUDIO; + + // other hparams + { + const char * prefix = is_vision ? 
"vision" : "audio"; + get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd); + get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head); + get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff); + get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer); + get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim); + get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps); + + if (is_vision) { + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy + get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false); + if (hparams.minicpmv_query_num == 0) { + // Fallback to hardcoded values for legacy models + if (hparams.minicpmv_version == 3) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 4) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 5) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 6) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 100045) { + hparams.minicpmv_query_num = 64; + } else { + hparams.minicpmv_query_num = 96; + } + } + } else if (is_audio) { + get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); + // some hparams are unused, but still need to set to avoid issues + hparams.image_size = 0; + hparams.patch_size = 1; + + } else { + GGML_ASSERT(false && "unknown modality"); + } + + // for pinpoints, we need to convert it into a list of resolution candidates + { + std::vector pinpoints; + get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false); + if (!pinpoints.empty()) { + for (size_t i = 0; i < pinpoints.size(); i += 2) { + hparams.image_res_candidates.push_back({ + pinpoints[i], + pinpoints[i+1], + }); + } + } + } + + // default warmup value + hparams.warmup_image_size = hparams.image_size; + + hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2; + + { + bool use_gelu = false; + bool use_silu = false; + get_bool(KEY_USE_GELU, use_gelu, false); + get_bool(KEY_USE_SILU, use_silu, false); + if (use_gelu && use_silu) { + throw std::runtime_error(string_format("%s: both use_gelu and use_silu are set to true\n", __func__)); + } + if (use_gelu) { + hparams.ffn_op = FFN_GELU; + log_ffn_op = "gelu"; + } else if (use_silu) { + hparams.ffn_op = FFN_SILU; + log_ffn_op = "silu"; + } else { + hparams.ffn_op = FFN_GELU_QUICK; + log_ffn_op = "gelu_quick"; + } + } + + { + std::string mm_patch_merge_type; + get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); + if (mm_patch_merge_type == "spatial_unpad") { + hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD; + } + } + + if (is_vision) { + int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN); + int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD); + GGML_ASSERT(idx_mean >= 0 && "image_mean not found"); + GGML_ASSERT(idx_std >= 0 && "image_std not found"); + const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean); + const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std); + for (int i = 0; i < 3; ++i) { + hparams.image_mean[i] = mean_data[i]; + hparams.image_std[i] = std_data[i]; + } + } + + // Load the vision feature layer indices if they are explicitly provided; + // if 
multiple vision feature layers are present, the values will be concatenated + // to form the final visual features. + // NOTE: gguf conversions should standardize the values of the vision feature layer to + // be non-negative, since we use -1 to mark values as unset here. + std::vector vision_feature_layer; + get_arr_int(KEY_FEATURE_LAYER, vision_feature_layer, false); + // convert std::vector to std::unordered_set + for (auto & layer : vision_feature_layer) { + hparams.vision_feature_layer.insert(layer); + } + + // model-specific params + switch (model.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + if (hparams.minicpmv_version == 0) { + hparams.minicpmv_version = 2; // default to 2 if not set + } + } break; + case PROJECTOR_TYPE_INTERNVL: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); + } break; + case PROJECTOR_TYPE_LFM2: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + hparams.set_limit_image_tokens(64, 256); + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json + // TODO: verify the image_min_tokens + hparams.n_merge = 1; // the original pixtral does not use patch merging + hparams.rope_theta = 10000.0f; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_KIMIVL: + { + hparams.rope_theta = 10000.0f; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + // TODO: check kimivl preprocessor for exact values + hparams.set_limit_image_tokens(8, 1024); + hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_KIMIK25: + { + hparams.rope_theta = 10000.0f; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + + int min_pixels = 0, max_pixels = 0; + get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false); + get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false); + if (min_pixels > 0 && max_pixels > 0) { + hparams.image_min_pixels = min_pixels; + hparams.image_max_pixels = max_pixels; + hparams.warmup_image_size = static_cast(std::sqrt(max_pixels)); + } else { + hparams.set_limit_image_tokens(2, 4096); + } + } break; + case PROJECTOR_TYPE_GEMMA3: + { + // default value (used by all model sizes in gemma 3 family) + // number of patches for each **side** is reduced by a factor of 4 + hparams.n_merge = 4; + // test model (tinygemma3) has a different value, we optionally read it + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; + + case PROJECTOR_TYPE_GEMMA3NV: + { + // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16) + // Similar configuration to Gemma3 + hparams.n_merge = 1; // MobileNetV5 handles resizing internally + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + { + hparams.n_merge = 2; // default value for Qwen 2 and 2.5 + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it + // ref: 
https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup + const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size; + if (hparams.image_min_pixels < warn_min_pixels) { + LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__); + LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__); + LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__); + } + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + hparams.n_merge = 2; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + std::vector wa_layer_indexes_vec; + get_arr_int(KEY_WIN_ATTN_LAYER_INDEXES, wa_layer_indexes_vec, true); + for (auto & layer : wa_layer_indexes_vec) { + hparams.wa_layer_indexes.insert(layer); + } + // support max_height * max_width = 8000 * 8000. 8000/16/2 = 250 image tokens + hparams.set_limit_image_tokens(1, 62500); + hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_GLM4V: + { + hparams.rope_theta = 10000.0f; + hparams.n_merge = 2; // default value for GLM4-V + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup + } break; + case PROJECTOR_TYPE_LLAMA4: + { + hparams.rope_theta = 10000.0f; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); + set_llava_uhd_res_candidates(model, 3); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX || + model.proj_type == PROJECTOR_TYPE_VOXTRAL || + model.proj_type == PROJECTOR_TYPE_GLMA; + get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack); + hparams.ffn_op = FFN_GELU_ERF; + log_ffn_op = "gelu_erf"; // temporary solution for logging + + // audio preprocessing params + hparams.audio_chunk_len = 30; // in seconds + hparams.audio_sample_rate = 16000; + hparams.audio_n_fft = 400; + hparams.audio_window_len = 400; + hparams.audio_hop_len = 160; + } break; + case PROJECTOR_TYPE_LFM2A: + { + // audio preprocessing params + hparams.audio_chunk_len = 1; // in seconds + hparams.audio_sample_rate = 16000; + hparams.audio_n_fft = 512; + hparams.audio_window_len = 400; + hparams.audio_hop_len = 160; + } break; + default: + break; + } + + // sanity check + { + if (hparams.image_max_pixels < hparams.image_min_pixels) { + throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels)); + } + } + + LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); + LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd); + LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head); + LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff); + LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer); + LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str()); + LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim); + if (is_vision) { + LOG_INF("\n--- vision hparams ---\n"); + LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size); + 
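+            // token <-> pixel conversion (editorial note, not part of the original code):
+            //   the warn_min_pixels check in the Qwen-VL branch above converts tokens to pixels as
+            //     pixels = tokens * n_merge^2 * patch_size^2
+            //   e.g. 1024 tokens with n_merge = 2 and patch_size = 14 gives 1024 * 4 * 196 = 802816 pixels,
+            //   i.e. roughly an 896 x 896 image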
LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size); + LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector); + LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version); + LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge); + LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + if (!hparams.wa_layer_indexes.empty()) { + LOG_INF("%s: wa_layer_indexes: ", __func__); + for (auto & layer : hparams.wa_layer_indexes) { + LOG_INF("%d ", layer); + } + LOG_INF("\n"); + } + if (hparams.image_min_pixels > 0) { + LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : ""); + } + if (hparams.image_max_pixels > 0) { + LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : ""); + } + } else if (is_audio) { + LOG_INF("\n--- audio hparams ---\n"); + LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins); + LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor); + LOG_INF("%s: audio_chunk_len: %d\n", __func__, hparams.audio_chunk_len); + LOG_INF("%s: audio_sample_rate: %d\n", __func__, hparams.audio_sample_rate); + LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft); + LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len); + LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len); + } + LOG_INF("\n"); + LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); + } + } + + void load_tensors(clip_ctx & ctx_clip) { + auto & model = ctx_clip.model; + auto & hparams = model.hparams; + std::map tensor_offset; + std::vector tensors_to_load; + + // TODO @ngxson : support both audio and video in the future + const char * prefix = model.modality == CLIP_MODALITY_AUDIO ? 
"a" : "v"; + + // get offsets + for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) { + const char * name = gguf_get_tensor_name(ctx_gguf.get(), i); + tensor_offset[name] = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), i); + } + + // create data context + struct ggml_init_params params = { + /*.mem_size =*/ static_cast(gguf_get_n_tensors(ctx_gguf.get()) + 1) * ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_clip.ctx_data.reset(ggml_init(params)); + if (!ctx_clip.ctx_data) { + throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__)); + } + + // helper function + auto get_tensor = [&](const std::string & name, bool required = true) { + ggml_tensor * cur = ggml_get_tensor(ctx_meta.get(), name.c_str()); + if (!cur && required) { + throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str())); + } + if (cur) { + tensors_to_load.push_back(cur); + // add tensors to context + ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); + ggml_set_name(data_tensor, cur->name); + cur = data_tensor; + } + return cur; + }; + + model.class_embedding = get_tensor(TN_CLASS_EMBD, false); + + model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false); + model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false); + + model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false); + model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false); + + model.patch_bias = get_tensor(TN_PATCH_BIAS, false); + model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); + model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); + + model.norm_embd_w = get_tensor(string_format(TN_NORM_EMBD, "weight"), false); + model.norm_embd_b = get_tensor(string_format(TN_NORM_EMBD, "bias"), false); + + model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false); + + if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) { + hparams.n_layer = 0; // gemma3n does not use normal layer structure + } + + // layers + model.layers.resize(hparams.n_layer); + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false); + layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false); + layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false); + layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight")); + layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false); + layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false); + layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false); + layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false); + layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false); + layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias + layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias + + layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false); + layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); + layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); + layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + layer.qkv_b = 
get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false); + layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); + layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false); + + // ffn + layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false); + layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false); + layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false); + layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false); + + + // qwen3vl deepstack layer + layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false); + layer.deepstack_norm_b = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "bias"), false); + layer.deepstack_fc1_w = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "weight"), false); + layer.deepstack_fc1_b = get_tensor(string_format(TN_DEEPSTACK_FC1, il, "bias"), false); + layer.deepstack_fc2_w = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "weight"), false); + layer.deepstack_fc2_b = get_tensor(string_format(TN_DEEPSTACK_FC2, il, "bias"), false); + if (layer.has_deepstack()) { + model.n_deepstack_layers++; + } + + // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here + // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! + bool is_ffn_swapped = ( + // only old models need this fix + model.proj_type == PROJECTOR_TYPE_MLP + || model.proj_type == PROJECTOR_TYPE_MLP_NORM + || model.proj_type == PROJECTOR_TYPE_LDP + || model.proj_type == PROJECTOR_TYPE_LDPV2 + || model.proj_type == PROJECTOR_TYPE_QWEN2VL + || model.proj_type == PROJECTOR_TYPE_QWEN25VL + || model.proj_type == PROJECTOR_TYPE_GLM_EDGE + || model.proj_type == PROJECTOR_TYPE_GEMMA3 + || model.proj_type == PROJECTOR_TYPE_IDEFICS3 + || model.proj_type == PROJECTOR_TYPE_MINICPMV + ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; + if (is_ffn_swapped) { + // swap up and down weights + ggml_tensor * tmp = layer.ff_up_w; + layer.ff_up_w = layer.ff_down_w; + layer.ff_down_w = tmp; + // swap up and down biases + tmp = layer.ff_up_b; + layer.ff_up_b = layer.ff_down_b; + layer.ff_down_b = tmp; + if (il == 0) { + LOG_WRN("%s: ffn up/down are swapped\n", __func__); + } + } + } + + + switch (model.proj_type) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + { + // LLaVA projection + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"), false); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false); + // Yi-type llava + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"), false); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + // missing in Yi-type llava + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"), false); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + // Yi-type llava + model.mm_3_w = get_tensor(string_format(TN_LLAVA_PROJ, 3, "weight"), false); + model.mm_3_b = get_tensor(string_format(TN_LLAVA_PROJ, 3, "bias"), false); + model.mm_4_w = get_tensor(string_format(TN_LLAVA_PROJ, 4, "weight"), false); + model.mm_4_b = get_tensor(string_format(TN_LLAVA_PROJ, 4, "bias"), false); + if (model.mm_3_w) { + // 
TODO: this is a hack to support Yi-type llava + model.proj_type = PROJECTOR_TYPE_MLP_NORM; + } + model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); + } break; + case PROJECTOR_TYPE_LDP: + { + // MobileVLM projection + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_model_mlp_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + model.mm_model_block_1_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight")); + model.mm_model_block_1_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight")); + model.mm_model_block_1_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias")); + model.mm_model_block_1_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight")); + model.mm_model_block_1_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias")); + model.mm_model_block_1_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight")); + model.mm_model_block_1_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias")); + model.mm_model_block_1_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight")); + model.mm_model_block_1_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight")); + model.mm_model_block_1_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias")); + model.mm_model_block_2_block_0_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight")); + model.mm_model_block_2_block_0_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight")); + model.mm_model_block_2_block_0_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias")); + model.mm_model_block_2_block_1_fc1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight")); + model.mm_model_block_2_block_1_fc1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias")); + model.mm_model_block_2_block_1_fc2_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight")); + model.mm_model_block_2_block_1_fc2_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias")); + model.mm_model_block_2_block_2_0_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight")); + model.mm_model_block_2_block_2_1_w = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight")); + model.mm_model_block_2_block_2_1_b = get_tensor(string_format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias")); + } break; + case PROJECTOR_TYPE_LDPV2: + { + // MobilVLM_V2 projection + model.mm_model_mlp_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_model_mlp_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + model.mm_model_mlp_2_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "bias")); + model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); + model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); + } break; + case PROJECTOR_TYPE_MINICPMV: + { + // model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); + model.mm_model_query = get_tensor(TN_MINICPMV_QUERY); + model.mm_model_proj = get_tensor(TN_MINICPMV_PROJ); + 
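+                    // resampler layout (editorial note, not part of the original code):
+                    //   mm_model_query             - learned query tokens (minicpmv_query_num of them)
+                    //   mm_model_proj              - final projection applied to the resampler output
+                    //   mm_model_kv_proj           - projects the ViT features that the queries attend over (loaded below)
+                    //   mm_model_attn_{q,k,v,out}  - one cross-attention layer between queries and image features
+                    //   mm_model_ln_{q,kv,post}    - layernorms on the queries, the keys/values and the output
+                    //   mm_model_pos_embed_k       - 2D position embedding added to the keys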
model.mm_model_kv_proj = get_tensor(TN_MINICPMV_KV_PROJ); + model.mm_model_attn_q_w = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "weight")); + model.mm_model_attn_k_w = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "weight")); + model.mm_model_attn_v_w = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "weight")); + model.mm_model_attn_q_b = get_tensor(string_format(TN_MINICPMV_ATTN, "q", "bias")); + model.mm_model_attn_k_b = get_tensor(string_format(TN_MINICPMV_ATTN, "k", "bias")); + model.mm_model_attn_v_b = get_tensor(string_format(TN_MINICPMV_ATTN, "v", "bias")); + model.mm_model_attn_o_w = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "weight")); + model.mm_model_attn_o_b = get_tensor(string_format(TN_MINICPMV_ATTN, "out", "bias")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_MINICPMV_LN, "q", "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_MINICPMV_LN, "q", "bias")); + model.mm_model_ln_kv_w = get_tensor(string_format(TN_MINICPMV_LN, "kv", "weight")); + model.mm_model_ln_kv_b = get_tensor(string_format(TN_MINICPMV_LN, "kv", "bias")); + model.mm_model_ln_post_w = get_tensor(string_format(TN_MINICPMV_LN, "post", "weight")); + model.mm_model_ln_post_b = get_tensor(string_format(TN_MINICPMV_LN, "post", "bias")); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { + model.mm_model_adapter_conv_w = get_tensor(string_format(TN_GLM_ADAPER_CONV, "weight")); + model.mm_model_adapter_conv_b = get_tensor(string_format(TN_GLM_ADAPER_CONV, "bias")); + model.mm_model_mlp_0_w = get_tensor(string_format(TN_GLM_ADAPTER_LINEAR, "weight")); + model.mm_model_ln_q_w = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "weight")); + model.mm_model_ln_q_b = get_tensor(string_format(TN_GLM_ADAPTER_NORM_1, "bias")); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); + model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI)); + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_QWEN3VL: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm) + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); // merger.mlp.0 + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2 + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_GLM4V: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + model.mm_ffn_up_w = get_tensor(string_format(TN_MM_UP, "weight")); + model.mm_ffn_up_b = get_tensor(string_format(TN_MM_UP, "bias"), false); + model.mm_ffn_gate_w = 
get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_ffn_gate_b = get_tensor(string_format(TN_MM_GATE, "bias"), false); + model.mm_ffn_down_w = get_tensor(string_format(TN_MM_DOWN, "weight")); + model.mm_ffn_down_b = get_tensor(string_format(TN_MM_DOWN, "bias"), false); + model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); + model.mm_post_norm_b = get_tensor(string_format(TN_MM_POST_NORM, "bias"), false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight")); + model.mm_patch_merger_b = get_tensor(string_format(TN_MM_PATCH_MERGER, "bias")); + } break; + case PROJECTOR_TYPE_GEMMA3: + { + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false); + model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false); + model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false); + + model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false); + model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // Consume BN if present but likely folded + model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false); + model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false); + + model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false); + + // Dynamically load blocks stage by stage + for (int stage = 0; stage < 4; ++stage) { + int blocks_found_in_stage = 0; + + for (int blk_idx = 0; ; ++blk_idx) { + bool found_block = false; + mobilenetv5_block block; + + // 1. Check for Edge Residual (S0) + block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false); + if (block.s0_conv_exp_w) { + found_block = true; + block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false); + block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false); + block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false); + } + // 2. Check for UIR (Universal Inverted Residual) + else { + // Check for dw_start OR pw_exp (some UIR blocks skip dw_start) + block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false); + block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false); + + if (block.dw_start_w || block.pw_exp_w) { + found_block = true; + if (block.dw_start_w) { + block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false); + } + if (block.pw_exp_w) { + block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false); + } + block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false); + if (block.dw_mid_w) { + block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false); + } + block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false); + if (block.pw_proj_w) { + block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false); + } + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + // 3. 
Check for Attention (MQA) + // Even if UIR/Edge check failed, this might be a pure attention block + ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false); + if (attn_q_check) { + found_block = true; + block.attn_q_w = attn_q_check; + block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false); + block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false); + block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false); + block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false); + block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false); + block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false); + block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false); + block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false); + // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check + if (!block.layer_scale_w) { + block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false); + } + } + + if (found_block) { + model.mobilenet_blocks.push_back(block); + blocks_found_in_stage++; + } else { + // End of blocks for this stage + break; + } + } + + // Track where this stage ends in the flat vector + if (blocks_found_in_stage > 0) { + model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1); + LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1); + } + } + model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); + model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + model.projection = get_tensor(TN_MM_PROJECTOR); + } break; + case PROJECTOR_TYPE_LFM2: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B, false); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + { + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + // [IMG_BREAK] token embedding + model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + // for mistral small 3.1 + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); + } break; + case PROJECTOR_TYPE_LIGHTONOCR: + { + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b 
= get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); + model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); + } break; + case PROJECTOR_TYPE_ULTRAVOX: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight")); + } break; + case PROJECTOR_TYPE_QWEN2A: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_fc_w = get_tensor(string_format(TN_MM_AUDIO_FC, "weight")); + model.mm_fc_b = get_tensor(string_format(TN_MM_AUDIO_FC, "bias")); + } break; + case PROJECTOR_TYPE_VOXTRAL: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + } break; + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + } break; + case PROJECTOR_TYPE_INTERNVL: + { + model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); + model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); + } break; + case PROJECTOR_TYPE_GLMA: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_2_w = 
get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); + model.mm_boi = get_tensor(string_format(TN_TOK_BOI)); + model.mm_eoi = get_tensor(string_format(TN_TOK_EOI)); + } break; + case PROJECTOR_TYPE_LLAMA4: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); + model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); + } break; + case PROJECTOR_TYPE_COGVLM: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); + model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); + model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); + model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight")); + model.mm_boi = get_tensor(TN_TOK_BOI); + model.mm_eoi = get_tensor(TN_TOK_EOI); + } break; + case PROJECTOR_TYPE_JANUS_PRO: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + } break; + case PROJECTOR_TYPE_LFM2A: + { + for (int i : {0, 2, 3, 5, 6}) { + model.pre_encode_conv_X_w[i] = get_tensor(string_format(TN_CONV1D, i, "weight")); + model.pre_encode_conv_X_b[i] = get_tensor(string_format(TN_CONV1D, i, "bias")); + } + model.pre_encode_out_w = get_tensor(string_format(TN_PRE_ENCODE_OUT, "weight")); + model.pre_encode_out_b = get_tensor(string_format(TN_PRE_ENCODE_OUT, "bias")); + + model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight")); + model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias")); + + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + + layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight")); + layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias")); + layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight")); + layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias")); + layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight")); + layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias")); + layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight")); + layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias")); + + layer.pos_bias_u = get_tensor(string_format(TN_POS_BIAS_U, prefix, il)); + layer.pos_bias_v = get_tensor(string_format(TN_POS_BIAS_V, prefix, il)); + + layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight")); + layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias")); + + layer.linear_pos_w = get_tensor(string_format(TN_LINEAR_POS, 
prefix, il, "weight")); + + layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight")); + layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias")); + layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight")); + layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias")); + layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight")); + layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias")); + layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight")); + layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias")); + } + } break; + default: + GGML_ASSERT(false && "unknown projector type"); + } + + // load data + { + std::vector<uint8_t> read_buf; + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); + } + + // alloc memory and offload data + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); + ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); + ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + for (auto & t : tensors_to_load) { + ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); + const size_t offset = tensor_offset[t->name]; + fin.seekg(offset, std::ios::beg); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); + } + size_t num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast<char *>(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + } + fin.close(); + + LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); + } + } + + struct support_info_op { + ggml_tensor * op; + + // true if the op runs on the accelerated ctx_clip.backend + bool is_accel = true; + }; + + struct support_info_graph { + // whether the clip_ctx.backend supports flash attention + bool fattn = true; + ggml_tensor * fattn_op = nullptr; // for debugging + + std::vector<support_info_op> ops; + }; + + static void warmup(clip_ctx & ctx_clip) { + // create a fake batch + const auto & hparams = ctx_clip.model.hparams; + clip_image_f32_batch batch; + clip_image_f32_ptr img(clip_image_f32_init()); + if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { + img->nx = hparams.warmup_image_size; + img->ny = hparams.warmup_image_size; + LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny); + } else { + img->nx = hparams.warmup_audio_size; + img->ny = hparams.n_mel_bins; + LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx); + } + batch.entries.push_back(std::move(img)); + warmup(ctx_clip, batch); + } + + static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { + support_info_graph info; + + if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) { + // try to enable flash attention to see if it's supported + ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED; + info = alloc_compute_meta(ctx_clip, batch); + if (!info.fattn && info.fattn_op) { + auto op = info.fattn_op; +
LOG_WRN("%s: *****************************************************************\n", __func__); + LOG_WRN("%s: WARNING: flash attention not supported by %s, memory usage will increase\n", __func__, ggml_backend_name(ctx_clip.backend)); + LOG_WRN("%s: op params: \n", __func__); + static auto print_shape = [](const char * fn, const char * name, ggml_tensor * t) { + LOG_WRN("%s: %s: type = %s, ne = [%d %d %d %d], nb = [%d %d %d %d]\n", fn, + name, ggml_type_name(t->type), + t->ne[0], t->ne[1], t->ne[2], t->ne[3], + t->nb[0], t->nb[1], t->nb[2], t->nb[3]); + }; + print_shape(__func__, " dst", op); + print_shape(__func__, "src0", op->src[0]); + print_shape(__func__, "src1", op->src[1]); + print_shape(__func__, "src2", op->src[2]); + LOG_WRN("%s: please report this on github as an issue\n", __func__); + LOG_WRN("%s: *****************************************************************\n", __func__); + ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED; + alloc_compute_meta(ctx_clip, batch); + } + } else { + info = alloc_compute_meta(ctx_clip, batch); + if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__); + } + } + + ctx_clip.is_allocated = true; // mark buffers as allocated + + LOG_INF("%s: flash attention is %s\n", __func__, + (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled"); + + // print ops that are not supported by the GPU backend (if there is one) + if (ctx_clip.backend && ctx_clip.backend != ctx_clip.backend_cpu) { + std::vector unsupported_ops; + for (const auto & op : info.ops) { + if (!op.is_accel) { + unsupported_ops.push_back(op); + } + } + if (!unsupported_ops.empty()) { + LOG_WRN("%s: *****************************************************************\n", __func__); + LOG_WRN("%s: WARNING: the CLIP graph uses unsupported operators by the backend\n", __func__); + LOG_WRN("%s: the performance will be suboptimal \n", __func__); + LOG_WRN("%s: list of unsupported ops (backend=%s):\n", __func__, ggml_backend_name(ctx_clip.backend)); + for (const auto & op : unsupported_ops) { + LOG_WRN("%s: %16s: type = %s, ne = [%d %d %d %d]\n", __func__, + ggml_op_name(op.op->op), + ggml_type_name(op.op->type), + op.op->ne[0], op.op->ne[1], op.op->ne[2], op.op->ne[3]); + } + LOG_WRN("%s: flash attention is %s\n", __func__, + (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? 
"enabled" : "disabled"); + LOG_WRN("%s: please report this on github as an issue\n", __func__); + LOG_WRN("%s: ref: https://github.com/ggml-org/llama.cpp/pull/16837#issuecomment-3461676118\n", __func__); + LOG_WRN("%s: *****************************************************************\n", __func__); + } + } + } + + static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + + ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); + ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); + + for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) { + ggml_backend_t backend = ctx_clip.backend_ptrs[i]; + ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(ctx_clip.sched.get(), backend); + if (size > 1) { + LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } + + const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get()); + const int n_nodes = ggml_graph_n_nodes(gf); + + LOG_INF("%s: graph splits = %d, nodes = %d\n", __func__, n_splits, n_nodes); + + support_info_graph res { + /*.fattn = */ true, + /*.fattn_op = */ nullptr, + /*.ops = */ {}, + }; + + // check op support + for (int i = 0; i < ggml_graph_n_nodes(gf); i++) { + ggml_tensor * node = ggml_graph_node(gf, i); + res.ops.push_back({node, true}); + if (!ggml_backend_supports_op(ctx_clip.backend, node)) { + res.ops.back().is_accel = false; + if (node->op == GGML_OP_FLASH_ATTN_EXT) { + res.fattn = false; + res.fattn_op = node; + } + } + } + + return res; + } + + void get_bool(const std::string & key, bool & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_bool(ctx_gguf.get(), i); + } + + void get_i32(const std::string & key, int & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_i32(ctx_gguf.get(), i); + } + + void get_u32(const std::string & key, int & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_u32(ctx_gguf.get(), i); + } + + void get_f32(const std::string & key, float & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = gguf_get_val_f32(ctx_gguf.get(), i); + } + + void get_string(const std::string & key, std::string & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + output = std::string(gguf_get_val_str(ctx_gguf.get(), i)); + } + + void get_arr_int(const std::string & key, std::vector & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + int n = 
gguf_get_arr_n(ctx_gguf.get(), i); + output.resize(n); + const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i); + for (int i = 0; i < n; ++i) { + output[i] = values[i]; + } + } + + static void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) { + auto & hparams = model.hparams; + for (int x = 1; x <= max_patches_per_side; x++) { + for (int y = 1; y <= max_patches_per_side; y++) { + if (x == 1 && y == 1) { + continue; // skip the first point + } + hparams.image_res_candidates.push_back(clip_image_size{ + x*hparams.image_size, + y*hparams.image_size, + }); + } + } + } +}; + +struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) { + clip_ctx * ctx_vision = nullptr; + clip_ctx * ctx_audio = nullptr; + + try { + clip_model_loader loader(fname); + bool skip_audio = false; + + if (loader.has_vision) { + ctx_vision = new clip_ctx(ctx_params); + loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION); + loader.load_tensors(*ctx_vision); + if (ctx_params.warmup) { + loader.warmup(*ctx_vision); + } + + // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors + // we can remove this check when we implement audio support for Gemma 3N + skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV; + + // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f); + } + + if (loader.has_audio && !skip_audio) { + ctx_audio = new clip_ctx(ctx_params); + loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); + loader.load_tensors(*ctx_audio); + if (ctx_params.warmup) { + loader.warmup(*ctx_audio); + } + } + + } catch (const std::exception & e) { + LOG_ERR("%s: failed to load model '%s': %s\n", __func__, fname, e.what()); + + delete ctx_vision; + delete ctx_audio; + + return {nullptr, nullptr}; + } + + return {ctx_vision, ctx_audio}; +} + +struct clip_image_size * clip_image_size_init() { + struct clip_image_size * load_image_size = new struct clip_image_size(); + load_image_size->width = 448; + load_image_size->height = 448; + return load_image_size; +} + +struct clip_image_u8 * clip_image_u8_init() { + return new clip_image_u8(); +} + +struct clip_image_f32 * clip_image_f32_init() { + return new clip_image_f32(); +} + +struct clip_image_f32_batch * clip_image_f32_batch_init() { + return new clip_image_f32_batch(); +} + +unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) { + if (nx) *nx = img->nx; + if (ny) *ny = img->ny; + return img->buf.data(); +} + +void clip_image_size_free(struct clip_image_size * load_image_size) { + if (load_image_size == nullptr) { + return; + } + delete load_image_size; +} +void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } +void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; } +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; } + +size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) { + return batch->entries.size(); +} + +size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return 0; + } + return batch->entries[idx]->nx; +} + +size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", 
__func__, idx); + return 0; + } + return batch->entries[idx]->ny; +} + +clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { + if (idx < 0 || idx >= (int)batch->entries.size()) { + LOG_ERR("%s: invalid index %d\n", __func__, idx); + return nullptr; + } + return batch->entries[idx].get(); +} + +void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { + img->nx = nx; + img->ny = ny; + img->buf.resize(3 * nx * ny); + memcpy(img->buf.data(), rgb_pixels, img->buf.size()); +} + +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { + dst.nx = src.nx; + dst.ny = src.ny; + dst.buf.resize(src.buf.size()); + + // TODO @ngxson : seems like this could be done more efficiently on cgraph + for (size_t i = 0; i < src.buf.size(); ++i) { + int c = i % 3; // rgb + dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +// set of tools to manipulate images +// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv +struct img_tool { + enum resize_algo { + RESIZE_ALGO_BILINEAR, + RESIZE_ALGO_BICUBIC, + // RESIZE_ALGO_LANCZOS, // TODO + }; + + static void resize( + const clip_image_u8 & src, + clip_image_u8 & dst, + const clip_image_size & target_resolution, + resize_algo algo, + bool add_padding = true, // TODO: define the behavior for add_padding = false + std::array<uint8_t, 3> pad_color = {0, 0, 0}) { + dst.nx = target_resolution.width; + dst.ny = target_resolution.height; + dst.buf.resize(3 * dst.nx * dst.ny); + + if (dst.nx == src.nx && dst.ny == src.ny) { + // no resize needed, simple copy + dst.buf = src.buf; + return; + } + + if (!add_padding) { + // direct resize + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, dst, target_resolution.width, target_resolution.height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, dst, target_resolution.width, target_resolution.height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + } else { + // resize with padding + clip_image_u8 resized_image; + float scale_w = static_cast<float>(target_resolution.width) / src.nx; + float scale_h = static_cast<float>(target_resolution.height) / src.ny; + float scale = std::min(scale_w, scale_h); + int new_width = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width); + int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height); + + switch (algo) { + case RESIZE_ALGO_BILINEAR: + resize_bilinear(src, resized_image, new_width, new_height); + break; + case RESIZE_ALGO_BICUBIC: + resize_bicubic(src, resized_image, new_width, new_height); + break; + default: + throw std::runtime_error("Unsupported resize algorithm"); + } + + // fill dst with pad_color + fill(dst, pad_color); + + int offset_x = (target_resolution.width - new_width) / 2; + int offset_y = (target_resolution.height - new_height) / 2; + + composite(dst, resized_image, offset_x, offset_y); + } + } + + static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + dst.nx = w; + dst.ny = h; + dst.buf.resize(3 * w * h); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int src_idx = 3 * ((y + i)*image.nx + (x + j)); + int dst_idx = 3 *
(i*w + j); + dst.buf[dst_idx] = image.buf[src_idx]; + dst.buf[dst_idx + 1] = image.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = image.buf[src_idx + 2]; + } + } + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than longest_edge, it will be resized to longest_edge + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) { + GGML_ASSERT(align_size > 0); + if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) { + return {0, 0}; + } + + float scale = std::min(static_cast<float>(longest_edge) / inp_size.width, + static_cast<float>(longest_edge) / inp_size.height); + + float target_width_f = static_cast<float>(inp_size.width) * scale; + float target_height_f = static_cast<float>(inp_size.height) * scale; + + auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; }; + int aligned_width = ceil_by_factor(target_width_f); + int aligned_height = ceil_by_factor(target_height_f); + + return {aligned_width, aligned_height}; + } + + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will have min_pixels <= W*H <= max_pixels + // this is referred to as "smart_resize" in transformers code + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) { + GGML_ASSERT(align_size > 0); + const int width = inp_size.width; + const int height = inp_size.height; + + auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; }; + auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; }; + auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; }; + + // always align up first + int h_bar = std::max(align_size, round_by_factor(height)); + int w_bar = std::max(align_size, round_by_factor(width)); + + if (h_bar * w_bar > max_pixels) { + const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels); + h_bar = std::max(align_size, floor_by_factor(height / beta)); + w_bar = std::max(align_size, floor_by_factor(width / beta)); + } else if (h_bar * w_bar < min_pixels) { + const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width)); + h_bar = ceil_by_factor(height * beta); + w_bar = ceil_by_factor(width * beta); + } + + return {w_bar, h_bar}; + } + + // draw src image into dst image at offset (offset_x, offset_y) + static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) { + for (int y = 0; y < src.ny; ++y) { + for (int x = 0; x < src.nx; ++x) { + int dx = x + offset_x; + int dy = y + offset_y; + // skip pixels that would be out of bounds in the destination + if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) { + continue; + } + size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx)); + size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x)); + dst.buf[dst_idx + 0] = src.buf[src_idx + 0]; + dst.buf[dst_idx + 1] = src.buf[src_idx + 1]; + dst.buf[dst_idx + 2] = src.buf[src_idx + 2]; + } + } + } + + // fill the image with a solid color + static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) { + for (size_t i = 0; i < img.buf.size(); i += 3) { + img.buf[i] = color[0]; + img.buf[i + 1] = color[1]; +
img.buf[i + 2] = color[2]; + } + } + +private: + // Bilinear resize function + static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast(src.nx - 1) / target_width; + float y_ratio = static_cast(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast(px); + int y_floor = static_cast(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = lerp( + static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = lerp( + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + } + } + } + } + + // Bicubic resize function + // part of image will be cropped if the aspect ratio is different + static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5] = {}; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; + } + + static inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); + } + + // Linear interpolation between two points + static inline 
float lerp(float s, float e, float t) { + return s + (e - s) * t; + } +}; + +/** + * implementation of LLaVA-UHD: + * - https://arxiv.org/pdf/2403.11703 + * - https://github.com/thunlp/LLaVA-UHD + * - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 + * + * overview: + * - an image always has a single overview (downscaled image) + * - an image can have zero or more slices, depending on the image size + * - each slice can then be considered as a separate image + * + * for example: + * + * [overview] --> [slice 1] --> [slice 2] + * | | + * +--> [slice 3] --> [slice 4] + */ +struct llava_uhd { + struct slice_coordinates { + int x; + int y; + clip_image_size size; + }; + + struct slice_instructions { + clip_image_size overview_size; // size of downscaled image + clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) + clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices + std::vector<slice_coordinates> slices; + + img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + bool padding_overview = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6) + std::array<uint8_t, 3> pad_color_overview = {0, 0, 0}; + + img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC; + bool padding_refined = false; // if true, the refined image will be padded to the grid size (e.g. llava-1.6) + std::array<uint8_t, 3> pad_color_refined = {0, 0, 0}; + }; + + static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { + slice_instructions res; + const int patch_size = clip_get_patch_size(ctx); + const int slice_size = clip_get_image_size(ctx); + const int original_width = original_size.width; + const int original_height = original_size.height; + + const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; + const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty(); + + if (!has_slices) { + // skip slicing logic + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = clip_image_size{0, 0}; + res.grid_size = clip_image_size{0, 0}; + + return res; + } + + if (has_pinpoints) { + // has pinpoints, use them to calculate the grid size (e.g.
llava-1.6) + auto refine_size = llava_uhd::select_best_resolution( + original_size, + ctx->model.hparams.image_res_candidates); + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = refine_size; + res.grid_size = clip_image_size{0, 0}; + res.padding_refined = true; + res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding + + LOG_DBG("%s: using pinpoints for slicing\n", __func__); + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n", + __func__, original_width, original_height, + res.overview_size.width, res.overview_size.height, + res.refined_size.width, res.refined_size.height); + + for (int y = 0; y < refine_size.height; y += slice_size) { + for (int x = 0; x < refine_size.width; x += slice_size) { + slice_coordinates slice; + slice.x = x; + slice.y = y; + slice.size.width = std::min(slice_size, refine_size.width - x); + slice.size.height = std::min(slice_size, refine_size.height - y); + res.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", + __func__, (int)res.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + res.grid_size.height = refine_size.height / slice_size; + res.grid_size.width = refine_size.width / slice_size; + LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height); + + return res; + } + + // no pinpoints, dynamically calculate the grid size (e.g. minicpmv) + + auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); + res.overview_size = best_size; + + { + const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it + const float log_ratio = log((float)original_width / original_height); + const float ratio = (float)original_width * original_height / (slice_size * slice_size); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); + auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); + res.grid_size = best_grid; + res.refined_size = refine_size; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, original_width, original_height, + res.overview_size.width, res.overview_size.height, + res.refined_size.width, res.refined_size.height, + res.grid_size.width, res.grid_size.height); + + int width = refine_size.width; + int height = refine_size.height; + int grid_x = int(width / best_grid.width); + int grid_y = int(height / best_grid.height); + for (int patches_y = 0, ic = 0; + patches_y < refine_size.height && ic < best_grid.height; + patches_y += grid_y, ic += 1) { + for (int patches_x = 0, jc = 0; + patches_x < refine_size.width && jc < best_grid.width; + patches_x += grid_x, jc += 1) { + slice_coordinates slice; + slice.x = patches_x; + slice.y = patches_y; + slice.size.width = grid_x; + slice.size.height = grid_y; + res.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", + __func__, (int)res.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + } + + return res; + } + + static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { + std::vector output; + + // resize to overview size + clip_image_u8_ptr resized_img(clip_image_u8_init()); + img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview, + inst.padding_overview, 
inst.pad_color_overview); + output.push_back(std::move(resized_img)); + + if (inst.slices.empty()) { + // no slices, just return the resized image + return output; + } + + // resize to refined size + clip_image_u8_ptr refined_img(clip_image_u8_init()); + img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined, + inst.padding_refined, inst.pad_color_refined); + + // create slices + for (const auto & slice : inst.slices) { + int x = slice.x; + int y = slice.y; + int w = slice.size.width; + int h = slice.size.height; + + clip_image_u8_ptr img_slice(clip_image_u8_init()); + img_tool::crop(*refined_img, *img_slice, x, y, w, h); + output.push_back(std::move(img_slice)); + } + + return output; + } + +private: + static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + clip_image_size res; + res.width = ensure_divide(width, patch_size); + res.height = ensure_divide(height, patch_size); + return res; + } + + static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) { + float scale_width = static_cast(target_max.width) / orig.width; + float scale_height = static_cast(target_max.height) / orig.height; + float scale = std::min(scale_width, scale_height); + return clip_image_size{ + static_cast(orig.width * scale), + static_cast(orig.height * scale), + }; + } + + /** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * For example, when given a list of resolutions: + * - 100x100 + * - 200x100 + * - 100x200 + * - 200x200 + * + * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution). 
+ * + * @param original_size The original size of the image + * @param possible_resolutions A list of possible resolutions + * @return The best fit resolution + */ + static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) { + clip_image_size best_fit; + int min_wasted_area = std::numeric_limits<int>::max(); + int max_effective_resolution = 0; + + for (const clip_image_size & candidate : possible_resolutions) { + auto target_size = resize_maintain_aspect_ratio(original_size, candidate); + int effective_resolution = std::min( + target_size.width * target_size.height, + original_size.width * original_size.height); + int wasted_area = (candidate.width * candidate.height) - effective_resolution; + + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) { + max_effective_resolution = effective_resolution; + min_wasted_area = wasted_area; + best_fit = candidate; + } + + LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution); + } + + return best_fit; + } + + static int ensure_divide(int length, int patch_size) { + return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size); + } + + static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.width; + int height = original_size.height; + int grid_x = grid.width; + int grid_y = grid.height; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + clip_image_size grid_size; + grid_size.width = refine_width / grid_x; + grid_size.height = refine_height / grid_y; + + auto best_grid_size = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale); + int best_grid_width = best_grid_size.width; + int best_grid_height = best_grid_size.height; + + clip_image_size refine_size; + refine_size.width = best_grid_width * grid_x; + refine_size.height = best_grid_height * grid_y; + return refine_size; + } + + static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector<int> candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector<clip_image_size> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.push_back(clip_image_size{m, split_grids_nums / m}); + } + ++m; + } + } + + clip_image_size best_grid{1, 1}; + float min_error = std::numeric_limits<float>::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; + } +}; + +// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g.
grid layout) +struct lfm2_vl_image_processor { + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + static constexpr int min_tiles = 2; + static constexpr int max_tiles = 10; + static constexpr float max_pixels_tolerance = 2.0f; + static constexpr int tile_size = 512; + + static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { + llava_uhd::slice_instructions inst; + const auto & params = ctx->model.hparams; + const int align_size = params.patch_size * params.n_merge; + + inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; + inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels); + + // tile if either dimension exceeds tile_size with tolerance + const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance; + + if (!needs_tiling) { + inst.refined_size = clip_image_size{0, 0}; + inst.grid_size = clip_image_size{0, 0}; + return inst; + } + + const clip_image_size grid = get_grid_layout(original_size.height, original_size.width); + + inst.grid_size = grid; + inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height}; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, + original_size.width, original_size.height, + inst.overview_size.width, inst.overview_size.height, + inst.refined_size.width, inst.refined_size.height, + grid.width, grid.height); + + for (int row = 0; row < grid.height; row++) { + for (int col = 0; col < grid.width; col++) { + llava_uhd::slice_coordinates slice; + slice.x = col * tile_size; + slice.y = row * tile_size; + slice.size = clip_image_size{tile_size, tile_size}; + inst.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n", + __func__, (int)inst.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + return inst; + } + +private: + static clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = {1, 1}; + const float area = static_cast(width * height); + + for (const auto & ratio : target_ratios) { + const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + if (ratio_diff < best_ratio_diff) { + best_ratio_diff = ratio_diff; + best_ratio = ratio; + } else if (ratio_diff == best_ratio_diff) { + const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); + if (area > 0.5f * target_area) { + best_ratio = ratio; + } + } + } + return best_ratio; + } + + static std::vector get_target_ratios() { + std::vector ratios; + for (int n = min_tiles; n <= max_tiles; n++) { + for (int w = 1; w <= n; w++) { + for (int h = 1; h <= n; h++) { + if (w * h >= min_tiles && w * h <= max_tiles) { + bool found = false; + for (const auto & r : ratios) { + if (r.width == w && r.height == h) { + found = true; + break; + } + } + if (!found) { + ratios.push_back({w, h}); + } + } + } + } + } + std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { + return a.width * a.height < b.width * b.height; + }); + 
return ratios; + } + + static clip_image_size get_grid_layout(int height, int width) { + const float aspect_ratio = static_cast(width) / height; + const auto ratios = get_target_ratios(); + return find_closest_aspect_ratio(aspect_ratio, ratios, width, height); + } +}; + +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { + clip_image_size original_size{img->nx, img->ny}; + auto & params = ctx->model.hparams; + + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_MINICPMV: + { + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + { + GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); + clip_image_u8 resized; + const clip_image_size new_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * 2, + params.image_min_pixels, + params.image_max_pixels); + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); + // clip_image_save_to_bmp(resized, "preproc.bmp"); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + // clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + // res_imgs->data[0] = *res; + res_imgs->entries.push_back(std::move(img_f32)); + } break; + case PROJECTOR_TYPE_YOUTUVL: + { + const int patch_size = params.patch_size; // typically 16 + const int merge_size = params.n_merge; // typically 2 + const int align_size = patch_size * merge_size; // 32 + + const int max_num_patches = params.image_max_pixels > 0 ? 
+ params.image_max_pixels / (patch_size * patch_size) : 256; + + // Linear search for optimal scale to fit within max_num_patches + float scale = 1.0f; + int target_height = original_size.height; + int target_width = original_size.width; + + auto get_scaled_image_size = [align_size](float scale, int size) -> int { + float scaled_size = size * scale; + // Round up to nearest multiple of align_size + int aligned = static_cast(std::ceil(scaled_size / align_size)) * align_size; + // Ensure at least one patch + return std::max(align_size, aligned); + }; + + // Linear search with 0.02 step size + while (scale > 0.0f) { + target_height = get_scaled_image_size(scale, original_size.height); + target_width = get_scaled_image_size(scale, original_size.width); + + int num_patches_h = target_height / patch_size; + int num_patches_w = target_width / patch_size; + int num_patches = num_patches_h * num_patches_w; + + if (num_patches > max_num_patches) { + scale -= 0.02f; + } else { + break; + } + } + + clip_image_size new_size = {target_width, target_height}; + + // Resize the image + clip_image_u8 resized; + img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false); + + // Normalize to float32 + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std); + + // Add to results + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_IDEFICS3: + { + // The refined size has two steps: + // 1. Resize w/ aspect-ratio preserving such that the longer side is + // the preprocessor longest size + // 2. Resize w/out preserving aspect ratio such that both sides are + // multiples of image_size (always rounding up) + // + // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737 + const clip_image_size refined_size = img_tool::calc_size_preserved_ratio( + original_size, params.image_size, params.image_longest_edge); + // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n", + // __func__, original_size.width, original_size.height, + // refined_size.width, refined_size.height); + + llava_uhd::slice_instructions instructions; + instructions.overview_size = clip_image_size{params.image_size, params.image_size}; + instructions.refined_size = refined_size; + instructions.grid_size = clip_image_size{ + static_cast(std::ceil(static_cast(refined_size.width) / params.image_size)), + static_cast(std::ceil(static_cast(refined_size.height) / params.image_size)), + }; + for (int y = 0; y < refined_size.height; y += params.image_size) { + for (int x = 0; x < refined_size.width; x += params.image_size) { + // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y); + instructions.slices.push_back(llava_uhd::slice_coordinates{ + /* x */x, + /* y */y, + /* size */clip_image_size{ + std::min(params.image_size, refined_size.width - x), + std::min(params.image_size, refined_size.height - y) + } + }); + } + } + auto imgs = llava_uhd::slice_image(img, instructions); + + // cast and normalize to f32 + for (size_t i = 0; i < imgs.size(); ++i) { + // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = instructions.grid_size.width; + res_imgs->grid_y = instructions.grid_size.height; + } break; + + case 
PROJECTOR_TYPE_GLM_EDGE: + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution + { + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + //clip_image_save_to_bmp(resized_image, "resized.bmp"); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_GEMMA3NV: + { + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_JANUS_PRO: + { + // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384 + const std::array pad_color = {127, 127, 127}; + clip_image_u8 resized_image; + int sz = params.image_size; + img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); + clip_image_u8 resized_image; + // the original pixtral model doesn't have n_merge + const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge; + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * cur_merge, + params.image_min_pixels, + params.image_max_pixels); + img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + } break; + + case PROJECTOR_TYPE_LLAMA4: + { + GGML_ASSERT(!params.image_res_candidates.empty()); + auto const inst = llava_uhd::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + + case PROJECTOR_TYPE_LFM2: + { + auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + + case PROJECTOR_TYPE_KIMIVL: + { + GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); + const clip_image_size target_size = img_tool::calc_size_preserved_ratio( + original_size, + params.patch_size * params.n_merge, + 
+        case PROJECTOR_TYPE_KIMIVL:
+            {
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * params.n_merge,
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+                clip_image_u8 resized_img;
+                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+                res_imgs->entries.push_back(std::move(res));
+            } break;
+
+        case PROJECTOR_TYPE_KIMIK25:
+            {
+                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
+                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+                    original_size,
+                    params.patch_size * params.n_merge,
+                    params.image_min_pixels,
+                    params.image_max_pixels);
+                const std::array<uint8_t, 3> pad_color = {0, 0, 0};
+
+                clip_image_u8 resized_img;
+                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
+                res_imgs->entries.push_back(std::move(res));
+            } break;
+
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+        case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
+            {
+                // TODO @ngxson : refactor the code below to avoid duplicated logic
+
+                // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
+                // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
+
+                clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
+
+                // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
+                if (params.image_res_candidates.empty()) { // pad_to_square
+                    // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
+                    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
+                    const int longer_side = std::max(img->nx, img->ny);
+                    temp->nx = longer_side;
+                    temp->ny = longer_side;
+                    temp->buf.resize(3 * longer_side * longer_side);
+
+                    // background color in RGB from LLaVA (this is the mean rgb color * 255)
+                    const std::array<uint8_t, 3> pad_color = {122, 116, 104};
+
+                    // resize the image to the target_size
+                    img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+
+                    clip_image_f32_ptr res(clip_image_f32_init());
+                    normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
+                    res_imgs->entries.push_back(std::move(res));
+
+                } else {
+                    // "spatial_unpad" with "anyres" processing for llava-1.6
+                    auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+                    std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+                    for (size_t i = 0; i < imgs.size(); ++i) {
+                        // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+                        clip_image_f32_ptr res(clip_image_f32_init());
+                        normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+                        res_imgs->entries.push_back(std::move(res));
+                    }
+                }
+            } break;
+
+        default:
+            LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
+            return false;
+    }
+
+    return true;
+}
+
+ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
+    return ctx->model.image_newline;
+}
+
+void
clip_free(clip_ctx * ctx) { + if (ctx == nullptr) { + return; + } + delete ctx; +} + +// deprecated +size_t clip_embd_nbytes(const struct clip_ctx * ctx) { + const int32_t nx = ctx->model.hparams.image_size; + const int32_t ny = ctx->model.hparams.image_size; + return clip_embd_nbytes_by_img(ctx, nx, ny); +} + +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { + clip_image_f32 img; + img.nx = img_w; + img.ny = img_h; + return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + +int32_t clip_get_image_size(const struct clip_ctx * ctx) { + return ctx->model.hparams.image_size; +} + +int32_t clip_get_patch_size(const struct clip_ctx * ctx) { + return ctx->model.hparams.patch_size; +} + +int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { + return ctx->model.hparams.n_embd; +} + +const char * clip_patch_merge_type(const struct clip_ctx * ctx) { + return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat"; +} + +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + return (img->nx / params.patch_size) / 2; + default: + break; + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + const auto & proj = ctx->proj_type(); + switch (proj) { + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + return (img->ny / params.patch_size) / 2; + default: + break; + } + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->model.hparams; + + // for models with fixed size image, the input image is already pre-processed and resized to square + int patch_size = params.patch_size; + int n_patches = (img->nx / patch_size) * (img->ny / patch_size); + + projector_type proj = ctx->proj_type(); + + switch (proj) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_JANUS_PRO: + { + // do nothing + } break; + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + n_patches /= 4; + if (ctx->model.mm_boi) { + n_patches += 2; // for BOI and EOI token embeddings + } + } break; + case PROJECTOR_TYPE_MINICPMV: + { + // Use actual config value if available, otherwise fall back to hardcoded values + if (params.minicpmv_query_num > 0) { + n_patches = params.minicpmv_query_num; + } else { + // Fallback to hardcoded values for legacy models + if (params.minicpmv_version == 2) { + n_patches = 96; + } else if (params.minicpmv_version == 3) { + n_patches = 64; + } else if (params.minicpmv_version == 4) { + n_patches = 64; + } else if (params.minicpmv_version == 5) { + // MiniCPM-V 4.0 + n_patches = 64; + } else if (params.minicpmv_version == 6) { + // MiniCPM-V 4.5 + n_patches = 64; + } else if (params.minicpmv_version == 100045) { + // MiniCPM-o 4.5 + n_patches = 64; + } else { + GGML_ABORT("Unknown minicpmv version"); + } + } + } break; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_QWEN3VL: + case 
PROJECTOR_TYPE_GLM4V: + case PROJECTOR_TYPE_YOUTUVL: + { + // dynamic size (2 conv, so double patch size) + int x_patch = img->nx / (params.patch_size * 2); + int y_patch = img->ny / (params.patch_size * 2); + n_patches = x_patch * y_patch; + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + case PROJECTOR_TYPE_INTERNVL: + case PROJECTOR_TYPE_LLAMA4: + { + // both X and Y are downscaled by the scale factor + int scale_factor = ctx->model.hparams.n_merge; + n_patches /= (scale_factor * scale_factor); + } break; + case PROJECTOR_TYPE_GEMMA3NV: + { + // MobileNetV5 MSFA adapter always outputs fixed 16x16 resolution + // regardless of input size (see architecture description) + n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size; + } break; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + { + // dynamic size + int out_patch_size = params.patch_size * ctx->model.hparams.n_merge; + int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size; + int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size; + n_patches = x_patch * y_patch; + } break; + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + { + // dynamic size + int n_merge = ctx->model.hparams.n_merge; + int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1); + if (ctx->model.token_embd_img_break) { + n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row + } else { + n_patches = n_patches_y * n_patches_x; + } + } break; + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + n_patches = img->nx; + + const int proj_stack_factor = ctx->model.hparams.proj_stack_factor; + if (ctx->model.audio_has_stack_frames()) { + GGML_ASSERT(proj_stack_factor > 0); + const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor); + n_patches = n_len / proj_stack_factor; + } + + // whisper downscales input token by half after conv1d + n_patches /= 2; + + if (ctx->model.audio_has_avgpool()) { + // divide by 2 because of nn.AvgPool1d(2, stride=2) + n_patches /= 2; + } + } break; + case PROJECTOR_TYPE_GLMA: + { + n_patches = img->nx; + // whisper downscales input token by half after conv1d + n_patches /= 2; + // reshape by merge_factor + n_patches /= ctx->model.hparams.proj_stack_factor; + // for BOI and EOI token embeddings + n_patches += 2; + } break; + case PROJECTOR_TYPE_COGVLM: + { + n_patches += 2; // for BOI and EOI token embeddings + } break; + case PROJECTOR_TYPE_LFM2A: + { + n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; + } break; + default: + GGML_ABORT("unsupported projector type"); + } + + return n_patches; +} + +bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { + clip_image_f32_batch imgs; + clip_image_f32_ptr img_copy(clip_image_f32_init()); + *img_copy = *img; + imgs.entries.push_back(std::move(img_copy)); + + return clip_image_batch_encode(ctx, n_threads, &imgs, vec); +} + +bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { + const clip_image_f32_batch & imgs = *imgs_c_ptr; + int batch_size = imgs.entries.size(); + + // TODO @ngxson : implement batch size > 1 as a loop + // we don't need true batching support because the cgraph will gonna be big anyway + if (batch_size != 1) { + return 
false; // only support batch size of 1
+    }
+
+    // if buffers are not allocated, we need to do a warmup run to allocate them
+    if (!ctx->is_allocated) {
+        clip_model_loader::warmup(*ctx, *imgs_c_ptr);
+    }
+
+    // build the inference graph
+    ggml_backend_sched_reset(ctx->sched.get());
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
+
+    // set inputs
+    const auto & model   = ctx->model;
+    const auto & hparams = model.hparams;
+
+    const int image_size_width  = imgs.entries[0]->nx;
+    const int image_size_height = imgs.entries[0]->ny;
+
+    const int patch_size  = hparams.patch_size;
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
+    const int pos_w = image_size_width  / patch_size;
+    const int pos_h = image_size_height / patch_size;
+
+    auto get_inp_tensor = [&gf](const char * name) {
+        ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
+        if (inp == nullptr) {
+            GGML_ABORT("Failed to get tensor %s", name);
+        }
+        if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
+            GGML_ABORT("Tensor %s is not an input tensor", name);
+        }
+        return inp;
+    };
+
+    auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_F32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
+        ggml_tensor * cur = get_inp_tensor(name);
+        GGML_ASSERT(cur->type == GGML_TYPE_I32);
+        GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
+        ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
+    };
+
+    // set input pixel values
+    if (!imgs.is_audio) {
+        size_t nelem = 0;
+        for (const auto & img : imgs.entries) {
+            nelem += img->nx * img->ny * 3;
+        }
+        std::vector<float> inp_raw(nelem);
+
+        // layout of data (note: the channel dim is unrolled to better visualize the layout):
+        //
+        //   ┌──W──┐
+        //   │     H │  channel = R
+        //   ├─────┤ │
+        //   │     H │  channel = G
+        //   ├─────┤ │
+        //   │     H │  channel = B
+        //   └─────┘ │
+        //     ──────┘ x B
+
+        for (size_t i = 0; i < imgs.entries.size(); i++) {
+            const int nx = imgs.entries[i]->nx;
+            const int ny = imgs.entries[i]->ny;
+            const int n  = nx * ny;
+
+            for (int b = 0; b < batch_size; b++) {
+                float * batch_entry = inp_raw.data() + b * (3*n);
+                for (int y = 0; y < ny; y++) {
+                    for (int x = 0; x < nx; x++) {
+                        size_t base_src = 3*(y * nx + x); // idx of the first channel
+                        size_t base_dst =    y * nx + x;  // idx of the first channel
+                        batch_entry[      base_dst] = imgs.entries[b]->buf[base_src    ];
+                        batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
+                        batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
+                    }
+                }
+            }
+        }
+        set_input_f32("inp_raw", inp_raw);
+
+    } else {
+        // audio input
+        GGML_ASSERT(imgs.entries.size() == 1);
+        const auto & mel_inp = imgs.entries[0];
+        const int n_step = mel_inp->nx;
+        const int n_mel  = mel_inp->ny;
+        std::vector<float> inp_raw(n_step * n_mel);
+        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
+        set_input_f32("inp_raw", inp_raw);
+    }
+
+    // set input per projector
+    switch (ctx->model.proj_type) {
+        case PROJECTOR_TYPE_MINICPMV:
+            {
+                // inspired from siglip:
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+                //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+                std::vector<int32_t> positions(pos_h * pos_w);
+                int bucket_coords_h[1024];
+                int bucket_coords_w[1024];
+                for (int i = 0; i < pos_h; i++){
+                    bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+                }
+                for (int i = 0; i < pos_w; i++){
+                    bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+                }
+                for (int i = 0, id = 0; i < pos_h; i++){
+                    for (int j = 0; j < pos_w; j++){
+                        positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                    }
+                }
+                set_input_i32("positions", positions);
+
+                // inputs for resampler projector
+                // set the 2D positions (using float for sinusoidal embedding)
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<float> pos_data(n_pos);
+                // dimension H
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = static_cast<float>(i / n_patches_per_col);
+                }
+                set_input_f32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = static_cast<float>(i % n_patches_per_col);
+                }
+                set_input_f32("pos_w", pos_data);
+                // base frequency omega
+                const float base_freq = 10000.0f;
+                const int n_embd_proj = clip_n_mmproj_embd(ctx);
+                std::vector<float> omega(n_embd_proj / 4);
+                for (int i = 0; i < n_embd_proj / 4; ++i) {
+                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
+                }
+                set_input_f32("omega", omega);
+            } break;
+        case PROJECTOR_TYPE_QWEN2VL:
+        case PROJECTOR_TYPE_QWEN3VL:
+        case PROJECTOR_TYPE_GLM4V:
+            {
+                const int merge_ratio = hparams.n_merge;
+                const int pw = image_size_width  / patch_size;
+                const int ph = image_size_height / patch_size;
+                std::vector<int32_t> positions(n_pos * 4);
+                int ptr = 0;
+                for (int y = 0; y < ph; y += merge_ratio) {
+                    for (int x = 0; x < pw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                positions[                  ptr] = y + dy;
+                                positions[    num_patches + ptr] = x + dx;
+                                positions[2 * num_patches + ptr] = y + dy;
+                                positions[3 * num_patches + ptr] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_YOUTUVL:
+            {
+                // pw * ph   = number of tokens output by the ViT after applying the patch merger
+                // ipw * iph = number of vision tokens processed inside the ViT
+                const bool use_window_attn = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? hparams.n_wa_pattern > 0 : !hparams.wa_layer_indexes.empty();
+                const int merge_ratio = 2;
+                const int pw  = image_size_width  / patch_size / merge_ratio;
+                const int ph  = image_size_height / patch_size / merge_ratio;
+                const int ipw = image_size_width  / patch_size;
+                const int iph = image_size_height / patch_size;
+
+                std::vector<int32_t> idx    (ph * pw);
+                std::vector<int32_t> inv_idx(ph * pw);
+
+                if (use_window_attn) {
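+                    // build a permutation (idx / inv_idx) that groups all patches of one attention window into a contiguous range, plus the additive attention mask for those windows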
+                    const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
+                    const int grid_window = attn_window_size / patch_size / merge_ratio;
+                    int dst = 0;
+                    // [num_vision_tokens, num_vision_tokens] attention mask tensor
+                    std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
+                    int mask_row = 0;
+
+                    for (int y = 0; y < ph; y += grid_window) {
+                        for (int x = 0; x < pw; x += grid_window) {
+                            const int win_h = std::min(grid_window, ph - y);
+                            const int win_w = std::min(grid_window, pw - x);
+                            const int dst_0 = dst;
+                            // group all tokens belonging to the same window together (into a contiguous range)
+                            for (int dy = 0; dy < win_h; dy++) {
+                                for (int dx = 0; dx < win_w; dx++) {
+                                    const int src = (y + dy) * pw + (x + dx);
+                                    GGML_ASSERT(src < (int)idx.size());
+                                    GGML_ASSERT(dst < (int)inv_idx.size());
+                                    idx    [src] = dst;
+                                    inv_idx[dst] = src;
+                                    dst++;
+                                }
+                            }
+
+                            for (int r = 0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
+                                int row_offset = mask_row * (ipw * iph);
+                                std::fill(
+                                    mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
+                                    mask.begin() + row_offset + (dst   * merge_ratio * merge_ratio),
+                                    0.0);
+                                mask_row++;
+                            }
+                        }
+                    }
+
+                    set_input_i32("window_idx",     idx);
+                    set_input_i32("inv_window_idx", inv_idx);
+                    set_input_f32("window_mask",    mask);
+                } else {
+                    for (int i = 0; i < ph * pw; i++) {
+                        idx[i] = i;
+                    }
+                }
+
+                const int mpow = merge_ratio * merge_ratio;
+                std::vector<int32_t> positions(n_pos * 4);
+
+                int ptr = 0;
+                for (int y = 0; y < iph; y += merge_ratio) {
+                    for (int x = 0; x < ipw; x += merge_ratio) {
+                        for (int dy = 0; dy < 2; dy++) {
+                            for (int dx = 0; dx < 2; dx++) {
+                                auto remap = idx[ptr / mpow];
+                                remap = (remap * mpow) + (ptr % mpow);
+
+                                positions[                  remap] = y + dy;
+                                positions[    num_patches + remap] = x + dx;
+                                positions[2 * num_patches + remap] = y + dy;
+                                positions[3 * num_patches + remap] = x + dx;
+                                ptr++;
+                            }
+                        }
+                    }
+                }
+
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_PIXTRAL:
+        case PROJECTOR_TYPE_KIMIVL:
+        case PROJECTOR_TYPE_KIMIK25:
+        case PROJECTOR_TYPE_LIGHTONOCR:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int32_t> pos_data(n_pos);
+                // dimension H
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i / n_patches_per_col;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = i % n_patches_per_col;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
+        case PROJECTOR_TYPE_GLM_EDGE:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+            } break;
+        case PROJECTOR_TYPE_MLP:
+        case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_LDP:
+        case PROJECTOR_TYPE_LDPV2:
+            {
+                // llava and other models
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i;
+                }
+                set_input_i32("positions", positions);
+
+                // The patches vector is used to get rows to index into the embeds with;
+                // we should skip dim 0 only if we have CLS to avoid going out of bounds
+                // when retrieving the rows.
+                int patch_offset = model.class_embedding ?
1 : 0;
+                std::vector<int32_t> patches(num_patches);
+                for (int i = 0; i < num_patches; i++) {
+                    patches[i] = i + patch_offset;
+                }
+                set_input_i32("patches", patches);
+            } break;
+        case PROJECTOR_TYPE_GEMMA3:
+        case PROJECTOR_TYPE_GEMMA3NV:
+        case PROJECTOR_TYPE_IDEFICS3:
+        case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_QWEN2A:
+        case PROJECTOR_TYPE_GLMA:
+        case PROJECTOR_TYPE_ULTRAVOX:
+        case PROJECTOR_TYPE_LFM2:
+        case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+        case PROJECTOR_TYPE_JANUS_PRO:
+        case PROJECTOR_TYPE_COGVLM:
+            {
+                // do nothing
+            } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int32_t> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
+                // last pos is always kept 0, it's for CLS
+                // dimension H
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i / n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i % n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
+        case PROJECTOR_TYPE_LFM2A:
+            {
+                GGML_ASSERT(imgs.entries.size() == 1);
+                const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get());
+
+                auto d_model = 512;
+                auto seq_len = n_frames * 2 - 1;
+                std::vector<float> pos_emb(d_model*seq_len);
+                std::vector<float> inv_freq(d_model / 2);
+                for (size_t i = 0; i < inv_freq.size(); ++i) {
+                    inv_freq[i] = std::exp(-(std::log(10000.0) / (float)d_model) * (2.0f * (float)(i)));
+                }
+                for (int64_t pos = 0; pos < seq_len; ++pos) {
+                    for (size_t i = 0; i < inv_freq.size(); ++i) {
+                        const float ang = (n_frames - pos - 1) * inv_freq[i];
+                        pos_emb[pos*d_model + 2*i + 0] = sinf(ang); // even
+                        pos_emb[pos*d_model + 2*i + 1] = cosf(ang); // odd
+                    }
+                }
+                set_input_f32("pos_emb", pos_emb);
+            } break;
+        default:
+            GGML_ABORT("Unknown projector type");
+    }
+
+    // ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);
+    ggml_backend_dev_t dev = ggml_backend_get_device(ctx->backend_cpu);
+    ggml_backend_reg_t reg = dev ?
ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(ctx->backend_cpu, n_threads); + } + } + + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } + + // the last node is the embedding tensor + ggml_tensor * embeddings = ggml_graph_node(gf, -1); + + // sanity check (only support batch size of 1 for now) + const int n_tokens_out = embeddings->ne[1]; + const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + if (n_tokens_out != expected_n_tokens_out) { + LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); + GGML_ABORT("Invalid number of output tokens"); + } + + // copy the embeddings to the location passed by the user + if (vec != nullptr) { + ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + } + + // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set + if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) { + const int64_t n_embd = embeddings->ne[0]; + const int64_t n_tokens = embeddings->ne[1]; + std::vector emb_data(n_embd * n_tokens); + ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings)); + + LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n"); + LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens); + + // Print first few values of first token + LOG_INF("Token 0 (first 16 values): "); + for (int i = 0; i < std::min((int64_t)16, n_embd); i++) { + LOG_INF("%.6f ", emb_data[i]); + } + LOG_INF("\n"); + + // Print last few values of first token + if (n_embd > 16) { + LOG_INF("Token 0 (last 16 values): "); + for (int64_t i = n_embd - 16; i < n_embd; i++) { + LOG_INF("%.6f ", emb_data[i]); + } + LOG_INF("\n"); + } + + // Compute and print statistics + float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0]; + for (size_t i = 0; i < emb_data.size(); i++) { + sum += emb_data[i]; + sum_sq += emb_data[i] * emb_data[i]; + min_val = std::min(min_val, emb_data[i]); + max_val = std::max(max_val, emb_data[i]); + } + float mean = sum / emb_data.size(); + float variance = (sum_sq / emb_data.size()) - (mean * mean); + LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n", + mean, sqrtf(variance), min_val, max_val, sum); + LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n"); + } + + return true; +} + +int clip_n_mmproj_embd(const struct clip_ctx * ctx) { + switch (ctx->model.proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PIXTRAL: + case PROJECTOR_TYPE_LIGHTONOCR: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_MINICPMV: + return ctx->model.mm_model_proj->ne[0]; + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: + case PROJECTOR_TYPE_JANUS_PRO: + case PROJECTOR_TYPE_YOUTUVL: + return ctx->model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_QWEN3VL: + // main path + deepstack paths + return ctx->model.mm_1_b->ne[0] * (1 + 
ctx->model.n_deepstack_layers); + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_GEMMA3NV: + return ctx->model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->model.projection->ne[1]; + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_INTERNVL: + return ctx->model.mm_3_w->ne[1]; + case PROJECTOR_TYPE_LLAMA4: + return ctx->model.mm_model_proj->ne[1]; + case PROJECTOR_TYPE_QWEN2A: + return ctx->model.mm_fc_w->ne[1]; + case PROJECTOR_TYPE_GLMA: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_KIMIVL: + case PROJECTOR_TYPE_KIMIK25: + return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_COGVLM: + return ctx->model.mm_4h_to_h_w->ne[1]; + case PROJECTOR_TYPE_LFM2A: + return ctx->model.position_embeddings->ne[0]; + case PROJECTOR_TYPE_GLM4V: + return ctx->model.mm_ffn_down_w->ne[1]; + default: + GGML_ABORT("Unknown projector type"); + } +} + +int clip_is_minicpmv(const struct clip_ctx * ctx) { + // TODO: remove this function + if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) { + return ctx->model.hparams.minicpmv_version; + } + return 0; +} + +bool clip_is_glm(const struct clip_ctx * ctx) { + // TODO: remove this function + return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE; +} + +bool clip_is_llava(const struct clip_ctx * ctx) { + return ctx->model.hparams.has_llava_projector; +} + +bool clip_has_vision_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_VISION; +} + +bool clip_has_audio_encoder(const struct clip_ctx * ctx) { + return ctx->model.modality == CLIP_MODALITY_AUDIO; +} + +bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { + switch (ctx->proj_type()) { + case PROJECTOR_TYPE_ULTRAVOX: + case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + return true; + default: + return false; + } +} + +bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { + clip_image_f32 clip_img; + clip_img.buf.resize(h * w * 3); + for (int i = 0; i < h*w*3; i++) + { + clip_img.buf[i] = img[i]; + } + clip_img.nx = w; + clip_img.ny = h; + clip_image_encode(ctx, n_threads, &clip_img, vec); + return true; +} + +// +// API used internally with mtmd +// + +projector_type clip_get_projector_type(const struct clip_ctx * ctx) { + return ctx->proj_type(); +} + +void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) { + clip_image_f32 * audio = new clip_image_f32; + audio->nx = n_frames; + audio->ny = n_mel; + audio->buf.resize(n_frames * n_mel); + std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float)); + + batch->entries.push_back(clip_image_f32_ptr(audio)); + batch->is_audio = true; +} + +const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) { + return &ctx->model.hparams; +} + +// +// API for debugging +// +void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) { + clip_image_f32 img; + img.nx = w; + img.ny = h; + img.buf.resize(h * w * 3); + for (int i = 0; i < h * w * 3; i++) { + img.buf[i] = static_cast(fill_value); + } + clip_image_encode(ctx, 1, &img, nullptr); + GGML_ASSERT(img.buf.empty() && "expected, always stop here"); +} diff --git a/llama.cpp/tools/mtmd/clip.h b/llama.cpp/tools/mtmd/clip.h new file mode 100644 index 0000000..71b5848 --- /dev/null +++ 
b/llama.cpp/tools/mtmd/clip.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#include "ggml.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+// !!! Internal header, to be used by mtmd only !!!
+
+#define MTMD_INTERNAL_HEADER
+
+struct clip_ctx;
+
+struct clip_image_size {
+    int width;
+    int height;
+};
+
+struct clip_image_f32;
+struct clip_image_u8_batch;
+struct clip_image_f32_batch;
+
+enum clip_modality {
+    CLIP_MODALITY_VISION,
+    CLIP_MODALITY_AUDIO,
+};
+
+enum clip_flash_attn_type {
+    CLIP_FLASH_ATTN_TYPE_AUTO     = -1,
+    CLIP_FLASH_ATTN_TYPE_DISABLED = 0,
+    CLIP_FLASH_ATTN_TYPE_ENABLED  = 1,
+};
+
+struct clip_context_params {
+    bool use_gpu;
+    enum clip_flash_attn_type flash_attn_type;
+    int image_min_tokens;
+    int image_max_tokens;
+    bool warmup;
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
+};
+
+struct clip_init_result {
+    struct clip_ctx * ctx_v; // vision context
+    struct clip_ctx * ctx_a; // audio context
+};
+
+struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
+
+void clip_free(struct clip_ctx * ctx);
+
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
+
+int32_t clip_get_image_size (const struct clip_ctx * ctx);
+int32_t clip_get_patch_size (const struct clip_ctx * ctx);
+int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+
+// TODO: should be enum, not string
+const char * clip_patch_merge_type(const struct clip_ctx * ctx);
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+
+struct clip_image_size      * clip_image_size_init(void);
+struct clip_image_u8        * clip_image_u8_init (void);
+struct clip_image_f32       * clip_image_f32_init(void);
+struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
+
+// nx, ny are the output image dimensions
+unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+
+void clip_image_size_free (struct clip_image_size * img_size);
+void clip_image_u8_free (struct clip_image_u8 * img);
+void clip_image_f32_free(struct clip_image_f32 * img);
+void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
+
+// use for accessing underlying data of clip_image_f32_batch
+size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
+size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
+size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
+struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+
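+// note: a typical call sequence is clip_init() -> clip_image_preprocess() -> clip_image_batch_encode() -> clip_free()
+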
+/**
+ * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
+ * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
+ */
+void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
+
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
+bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs);
+
+struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
+
+bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+int clip_is_minicpmv(const struct clip_ctx * ctx);
+bool clip_is_glm(const struct clip_ctx * ctx);
+bool clip_is_llava(const struct clip_ctx * ctx);
+// note for contributor: this clip_is_(model) pattern is deprecated
+// do NOT add new functions like this
+
+bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
+
+// used by audio input
+void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
+
+bool clip_has_vision_encoder(const struct clip_ctx * ctx);
+bool clip_has_audio_encoder(const struct clip_ctx * ctx);
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
diff --git a/llama.cpp/tools/mtmd/deprecation-warning.cpp b/llama.cpp/tools/mtmd/deprecation-warning.cpp
new file mode 100644
index 0000000..dded0a5
--- /dev/null
+++ b/llama.cpp/tools/mtmd/deprecation-warning.cpp
@@ -0,0 +1,22 @@
+#include <cstdio>
+#include <string>
+
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    size_t pos = filename.find_last_of("/\\");
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
diff --git a/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
new file mode 100644
index 0000000..2949fae
--- /dev/null
+++ b/llama.cpp/tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -0,0 +1,412 @@
+import argparse
+import os
+import json
+import re
+
+import torch
+import numpy as np
+from gguf import *
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
+
+TEXT = "clip.text"
+VISION = "clip.vision"
+
+
+def k(raw_key: str, arch: str) -> str:
+    return raw_key.format(arch=arch)
+
+
+def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
+    if name in (
+        "logit_scale",
+        "text_model.embeddings.position_ids",
+        "vision_model.embeddings.position_ids",
+    ):
+        return True
+
+    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
+        return True
+
+    if name.startswith("v") and not has_vision:
+        return True
+
+    if name.startswith("t") and not has_text:
+        return True
+
+    return False
+
+
+def get_tensor_name(name: str) -> str:
+    # Standardize the transformers llava next keys for
+    # image newline / mm projector with the classes in haotian-liu LLaVA
+    if name ==
"image_newline": + return "model.image_newline" + if name.startswith("multi_modal_projector"): + name = name.replace("multi_modal_projector", "mm") + if "linear_1" in name: + name = name.replace("linear_1", "0") + if "linear_2" in name: + name = name.replace("linear_2", "2") + return name + + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") + +# Selectable visual encoders that are compatible with this script +encoder_group = ap.add_mutually_exclusive_group() +encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False, + help="the visual encoder is Siglip.") + +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if ( + args.clip_model_is_vision or + not os.path.exists(dir_model + "/vocab.json") or + args.clip_model_is_openclip or + args.clip_model_is_siglip +): + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = config["text_config"] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +if args.clip_model_is_siglip: + model = SiglipVisionModel.from_pretrained(dir_model) + processor = None +elif args.clip_model_is_vision or args.clip_model_is_openclip: + model = CLIPVisionModel.from_pretrained(dir_model) + processor = None +else: + model = CLIPModel.from_pretrained(dir_model) + processor = CLIPProcessor.from_pretrained(dir_model) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_llava_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_llava_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG) + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_llava_projector", has_llava_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_llava_projector: + fout.add_description("vision-only CLIP model") +elif has_llava_projector: + fout.add_description("image encoder for LLaVA") + # add 
projector type + fout.add_string("clip.projector_type", args.projector_type) +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + assert t_hparams is not None + assert tokens is not None + if args.clip_model_is_siglip: + text_projection_dim = 0 + else: + text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"]) + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", text_projection_dim) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + + + +def get_non_negative_vision_feature_layers(v_hparams): + """ + Determine the vision feature layer(s) for the llava model, which are indices into the + hidden states of the visual encoder. Note that the hidden states array generally takes the + form: + + [, , ... ] + + so feature indices should be offset as n+1 to get the output of encoder block n. + We convert all vision feature layers to non-negative so that -1 can be used in + the model as an unset value. If no vision feature layer is found, we leave it unset. + """ + num_hidden_layers = v_hparams["num_hidden_layers"] + to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1 + feature_layers_key = None + # Key used for llava models in transformers + if "vision_feature_layer" in config: + feature_layers_key = "vision_feature_layer" + # Key used for llava models in the original format + elif "mm_vision_select_layer" in config: + feature_layers_key = "mm_vision_select_layer" + if feature_layers_key is not None: + feature_layers = config[feature_layers_key] + if isinstance(feature_layers, int): + feature_layers = [feature_layers] + return [to_non_negative(feature_layer) for feature_layer in feature_layers] + +# Determine if we have explicitly specified vision feature layers in our config +feature_layers = get_non_negative_vision_feature_layers(v_hparams) + +if has_vision_encoder: + # Siglip does not have a visual projector; set projection dim to 0 + if args.clip_model_is_siglip: + visual_projection_dim = 0 + else: + visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"]) + + # set vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", visual_projection_dim) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) + if feature_layers: + block_count = max(feature_layers) + else: + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) + # /** + # "image_grid_pinpoints": [ + # [ + # 336, + # 672 + # ], + # [ + # 672, + # 336 + # ], + # [ + # 672, + # 672 + # ], + # [ + # 
1008, + # 336 + # ], + # [ + # 336, + # 1008 + # ] + # ], + # Flattened: + # [ + # 336, 672, + # 672, 336, + # 672, 672, + # 1008, 336, + # 336, 1008 + # ] + # * + # */ + if "image_grid_pinpoints" in v_hparams: + # flatten it + image_grid_pinpoints = [] + for pinpoint in v_hparams["image_grid_pinpoints"]: + for p in pinpoint: + image_grid_pinpoints.append(p) + fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints) + if "image_crop_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"]) + if "image_aspect_ratio" in v_hparams: + fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"]) + if "image_split_resolution" in v_hparams: + fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"]) + if "mm_patch_merge_type" in v_hparams: + fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) + if "mm_projector_type" in v_hparams: + fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) + if feature_layers: + fout.add_array("clip.vision.feature_layer", feature_layers) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue] + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue] + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = v_hparams["hidden_act"] == "gelu" +fout.add_bool("clip.use_gelu", use_gelu) + + +if has_llava_projector: + # By default, we drop the last layer for llava projector + # models unless we have explicitly set vision feature layers + if feature_layers is None: + model.vision_model.encoder.layers.pop(-1) + else: + model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)] + + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + + fout.add_tensor(name, data) + + print("Projector tensors added\n") + +state_dict = model.state_dict() +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape 
= {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py new file mode 100644 index 0000000..848ef1c --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py @@ -0,0 +1,280 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * + +TEXT = "clip.text" +VISION = "clip.vision" +from transformers import SiglipVisionModel, SiglipVisionConfig + +def k(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if name in ( + "vision_model.head.probe", + "vision_model.head.attention.in_proj_weight", + "vision_model.head.attention.in_proj_bias", + "vision_model.head.attention.out_proj.weight", + "vision_model.head.attention.out_proj.bias", + "vision_model.head.layernorm.weight", + "vision_model.head.layernorm.bias", + "vision_model.head.mlp.fc1.weight", + "vision_model.head.mlp.fc1.bias", + "vision_model.head.mlp.fc2.weight", + "vision_model.head.mlp.fc2.bias" + ): + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.5, 0.5, 0.5] +default_image_std = [0.5, 0.5, 0.5] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: + config = json.load(f) + if args.clip_model_is_vision: + v_hparams = config + t_hparams = None + else: + v_hparams = config["vision_config"] + t_hparams = None + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +vision_config = SiglipVisionConfig(**v_hparams) +model = SiglipVisionModel(vision_config) +model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip"))) + +fname_middle = None +has_text_encoder = False +has_vision_encoder = True +has_glm_projector = True +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif 
args.llava_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_glm_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_glm_projector", has_glm_projector) +fout.add_file_type(ftype) +model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model) +fout.add_name(model_name) +if has_glm_projector: + fout.add_description("image encoder for glm4v") + fout.add_string("clip.projector_type", "adapter") +else: + fout.add_description("two-tower CLIP model") + +if has_text_encoder: + assert t_hparams is not None + assert tokens is not None + # text_model hparams + fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) + fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) + fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) + fout.add_token_list(tokens) + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) + fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) + fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) + fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"]) + + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +fout.add_bool("clip.use_gelu", True) + + +if has_glm_projector: + # model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue] + projector = torch.load(args.llava_projector) + for name, data in projector.items(): + name = get_tensor_name(name) + # pw and dw conv ndim==4 + if data.ndim == 2 or data.ndim == 4: + data = data.squeeze().numpy().astype(np.float16) + else: + data = data.squeeze().numpy().astype(np.float32) + if name.startswith("vision."): + name=name.replace("vision.","") + fout.add_tensor(name, data) + print(f"Projector {name} - {data.dtype} - shape = {data.shape}") + # print(f"Projector {name} tensors added\n") + +state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue] +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, 
has_glm_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + # print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + # print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + # print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + print(f"siglip {name} - {data.dtype} - shape = {data.shape}") + # print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py new file mode 100644 index 0000000..16bb915 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/glmedge-surgery.py @@ -0,0 +1,33 @@ +import argparse +import os +import torch +from transformers import AutoModel + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to GLM model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/glm.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")] +if len(clip_tensors) > 0: + clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/glm.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py new file mode 100644 index 0000000..4f2da3b --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery.py @@ -0,0 +1,38 @@ +import argparse +import glob +import os +import torch + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1] +checkpoint = torch.load(path) + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} 
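+# Editorial sketch (not part of the upstream script): the file written below is a
+# plain torch-saved dict of float32 tensors keyed by their original checkpoint
+# names, so the surgery result can be sanity-checked afterwards with e.g.:
+#
+#   import torch
+#   proj = torch.load(f"{args.model}/llava.projector", map_location="cpu")
+#   for name, t in proj.items():
+#       print(name, tuple(t.shape), t.dtype)   # e.g. model.mm_projector.0.weight
+#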
+torch.save(projector, f"{args.model}/llava.projector") + +# BakLLaVA models contain CLIP tensors in it +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")] +if len(clip_tensors) > 0: + clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/llava.clip") + + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + + + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py new file mode 100644 index 0000000..b07c3e3 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/llava_surgery_v2.py @@ -0,0 +1,180 @@ +import argparse +import glob +import os +import torch +from safetensors import safe_open +from safetensors.torch import save_file +from typing import Any, ContextManager, cast + +# Function to determine if file is a SafeTensor file +def is_safetensor_file(file_path): + return file_path.endswith('.safetensors') + + +# Unified loading function +def load_model(file_path): + if is_safetensor_file(file_path): + tensors = {} + with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f: + for key in f.keys(): + tensors[key] = f.get_tensor(key).clone() + # output shape + print(f"{key} : {tensors[key].shape}") + return tensors, 'safetensor' + else: + return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch' + + +# Unified saving function +def save_model(model, file_path, file_type): + if file_type == 'safetensor': + # safe_save(model, file_path) + save_file(model, file_path) + else: + torch.save(model, file_path) + +# Helpers to match weight names from specific components or +# determine if a saved shard contains that component +def is_vision_tower(weight_name): + return ( + weight_name.startswith("model.vision_tower") or + weight_name.startswith("vit.") or + weight_name.startswith("vision_tower") + ) + +def is_newline(weight_name): + return ( + weight_name.startswith("model.image_newline") or + weight_name.startswith("image_newline") + ) + +def is_mm_projector(weight_name): + return ( + weight_name.startswith("model.mm_projector") or + weight_name.startswith("vision_proj.") or + weight_name.startswith("multi_modal_projector") + ) + +def newline_criteria(checkpoint): + return any(is_newline(k) for k in checkpoint.keys()) + +def proj_criteria(checkpoint): + return any(is_mm_projector(k) for k in checkpoint.keys()) + +# Adapted function to clean vision tower from checkpoint +def clean_vision_tower_from_checkpoint(checkpoint_path): + checkpoint, file_type = load_model(checkpoint_path) + # file_type = 'pytorch' + model_path = os.path.dirname(checkpoint_path) + print(f"Searching for vision tower tensors in {checkpoint_path}") + clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)] + + if len(clip_tensors) > 0: + print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") + # Adapted for file type + clip_path = os.path.join(model_path, "llava.clip") + + if os.path.exists(clip_path): + print(f"Loading existing llava.clip from {clip_path}") + existing_clip, _ = load_model(clip_path) + else: + print(f"Creating new 
llava.clip at {clip_path}") + existing_clip = {} + # Update existing_clip with new tensors, avoid duplicates + for name in clip_tensors: + simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name + print(f"Adding {simple_name} to llava.clip") + if simple_name not in existing_clip: + existing_clip[simple_name] = checkpoint[name] + + # Save the updated clip tensors back to llava.clip + save_model(existing_clip, clip_path, 'pytorch') + + # Remove the tensors from the original checkpoint + for name in clip_tensors: + del checkpoint[name] + + checkpoint_path = checkpoint_path + return True + return False + +def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): + newline_checkpoint_path = None + projector_checkpoint_path = None + + for path in checkpoint_paths: + checkpoint, _ = load_model(path) + if newline_criteria(checkpoint) and newline_checkpoint_path is None: + newline_checkpoint_path = path + if projector(checkpoint): + projector_checkpoint_path = path + + return newline_checkpoint_path, projector_checkpoint_path + + +# Command-line interface setup +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", required=True, help="Path to LLaVA v1.5+ model") +ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files") +args = ap.parse_args() + +if args.clean_vision_tower: + # Generalized to handle both PyTorch and SafeTensors models + model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) + # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))] + checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] + for projector_checkpoint_path in checkpoint_paths: + print(f"Cleaning {projector_checkpoint_path}") + if not clean_vision_tower_from_checkpoint(projector_checkpoint_path): + print(f"No vision tower found in {projector_checkpoint_path}") + # we break once none is found, so far all models append them at the end + # break + print("Done! 
All vision tower tensors are removed from the model files and stored in llava.clip file.") + +# Now we look for the projector in the last checkpoint +model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True) +checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])] +# last_checkpoint_path = checkpoint_paths[0] +# first_checkpoint_path = checkpoint_paths[-1] +newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria) + +print(f"Taking projector from {projector_checkpoint_path}") +first_mm_tensors = [] +first_checkpoint = None +if newline_checkpoint_path is not None: + print(f"Taking newline from {newline_checkpoint_path}") + first_checkpoint, file_type = load_model(newline_checkpoint_path) + first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)] + +# Load the checkpoint +mm_tensors = [] +last_checkpoint = None +if projector_checkpoint_path is not None: + last_checkpoint, file_type = load_model(projector_checkpoint_path) + mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)] + +if len(mm_tensors) == 0: + if last_checkpoint is not None: + for k, v in last_checkpoint.items(): + print(k) + print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.") + print("No tensors found. Is this a LLaVA model?") + exit() + +print(f"Found {len(mm_tensors)} tensors to extract.") +print(f"Found additional {len(first_mm_tensors)} tensors to extract.") +# projector = {name: checkpoint.[name].float() for name in mm_tensors} +projector = {} +for name in mm_tensors: + assert last_checkpoint is not None + projector[name] = last_checkpoint[name].float() +for name in first_mm_tensors: + assert first_checkpoint is not None + projector[name] = first_checkpoint[name].float() + +if len(projector) > 0: + save_model(projector, f"{args.model}/llava.projector", 'pytorch') + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py new file mode 100644 index 0000000..944037e --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -0,0 +1,892 @@ +# coding=utf-8 +# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Siglip model. 
""" +# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes + + +import os +import math +import warnings + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn.init import _calculate_fan_in_and_fan_out + +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import ( + logging, +) +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +class SiglipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a + Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip + [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + Number of channels in the input images. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ Example: + ```python + >>> from transformers import SiglipVisionConfig, SiglipVisionModel + >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = SiglipVisionConfig() + >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = SiglipVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "siglip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + +_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" + +SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/siglip-base-patch16-224", + # See all SigLIP models at https://huggingface.co/models?filter=siglip +] + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + if tensor.dtype in [torch.float16, torch.bfloat16]: + # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu + og_dtype = tensor.dtype + tensor = tensor.to(torch.float32) + tensor.erfinv_() + tensor = tensor.to(og_dtype) + else: + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + if tensor.dtype == torch.float16: + # The `clamp_` op is not (yet?) 
defined in float16+cpu + tensor = tensor.to(torch.float32) + tensor.clamp_(min=a, max=b) + tensor = tensor.to(torch.float16) + else: + tensor.clamp_(min=a, max=b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0 +): + """Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \\leq \text{mean} \\leq b`. + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + denom = fan_in + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") + + +def default_flax_embed_init(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="normal") + +class SiglipVisionEmbeddings(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches_per_side = self.image_size // self.patch_size + self.num_patches = self.num_patches_per_side**2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + +class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip +class SiglipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip +class SiglipEncoderLayer(nn.Module): + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.self_attn = ( + SiglipAttention(config) + ) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + +class SiglipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SiglipVisionConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + + if isinstance(module, SiglipVisionEmbeddings): + width = self.config.hidden_size + nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width)) + elif isinstance(module, nn.Embedding): + default_flax_embed_init(module.weight) + elif isinstance(module, SiglipAttention): + nn.init.normal_(module.q_proj.weight) + nn.init.normal_(module.k_proj.weight) + nn.init.normal_(module.v_proj.weight) + nn.init.normal_(module.out_proj.weight) + nn.init.zeros_(module.q_proj.bias) + nn.init.zeros_(module.k_proj.bias) + nn.init.zeros_(module.v_proj.bias) + nn.init.zeros_(module.out_proj.bias) + elif isinstance(module, SiglipMLP): + nn.init.normal_(module.fc1.weight) + nn.init.normal_(module.fc2.weight) + nn.init.normal_(module.fc1.bias, std=1e-6) + nn.init.normal_(module.fc2.bias, std=1e-6) + elif isinstance(module, (nn.Linear, nn.Conv2d)): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +SIGLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +SIGLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip +class SiglipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SiglipEncoderLayer`]. + Args: + config: SiglipConfig + """ + + def __init__(self, config: SiglipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + +class SiglipVisionTransformer(SiglipPreTrainedModel): + config_class = SiglipVisionConfig + main_input_name = "pixel_values" + _supports_flash_attn_2 = True + + def __init__(self, config: SiglipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.embeddings.patch_embedding + +import argparse +import json +import re + +import numpy as np +from gguf import * +from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer +from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig + +TEXT = "clip.text" +VISION = "clip.vision" + + +def add_key_str(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_minicpmv and name in ["visual_projection.weight"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + 
+ +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. 
Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.5, 0.5, 0.5] +default_image_std = [0.5, 0.5, 0.5] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) +ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +# Read config.json to get actual model configuration +config_path = os.path.join(dir_model, "config.json") +model_config = {} +if os.path.isfile(config_path): + with open(config_path, "r", encoding="utf-8") as f: + model_config = json.load(f) + print(f"Loaded config from {config_path}") +else: + print(f"Warning: config.json not found at {config_path}") + +# If minicpmv_projector is not specified but the default path exists, use the default path +if args.minicpmv_projector is None: + default_projector_path = os.path.join(dir_model, "minicpmv.projector") + if os.path.isfile(default_projector_path): + args.minicpmv_projector = default_projector_path + print(f"Found default projector file: {default_projector_path}") + +# If output_dir is not specified, use model_dir as the default value +if args.output_dir is None: + args.output_dir = dir_model + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +# if args.clip_model_is_vision or args.clip_model_is_openclip: +# model = CLIPVisionModel.from_pretrained(dir_model) +# processor = None +# else: +# model = CLIPModel.from_pretrained(dir_model) +# processor = CLIPProcessor.from_pretrained(dir_model) + +minicpmv_version = args.minicpmv_version + +# Use actual config values instead of hardcoded ones +if model_config: + # For the projector/resampler, use the main model's hidden_size + emb_dim = model_config.get("hidden_size", 1536) + + # For the vision model, use vision_config values + vision_config_dict = model_config.get("vision_config", {}) + default_vision_config = { + "hidden_size": vision_config_dict.get("hidden_size", 1152), + "image_size": vision_config_dict.get("image_size", 980), + "intermediate_size": vision_config_dict.get("intermediate_size", 4304), + "model_type": vision_config_dict.get("model_type", "siglip"), + "num_attention_heads": vision_config_dict.get("num_attention_heads", 16), + 
"num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27), + "patch_size": vision_config_dict.get("patch_size", 14), + } + + # Use vision model's num_hidden_layers for block_count + block_count = vision_config_dict.get("num_hidden_layers", 27) + + print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}") + print(f"Vision config: {default_vision_config}") +else: + # Fallback to original hardcoded logic if config.json not found + emb_dim = 4096 + block_count = 26 + if minicpmv_version == 1: + emb_dim = 2304 + block_count = 26 + elif minicpmv_version == 2: + emb_dim = 4096 + block_count = 27 + elif minicpmv_version == 3: + emb_dim = 3584 + block_count = 27 + elif minicpmv_version == 4: + emb_dim = 3584 + block_count = 27 + elif minicpmv_version == 5: + emb_dim = 2560 + block_count = 27 + elif minicpmv_version == 6: + emb_dim = 4096 + block_count = 27 + elif minicpmv_version == 100045: + emb_dim = 4096 + block_count = 27 + + default_vision_config = { + "hidden_size": 1152, + "image_size": 980, + "intermediate_size": 4304, + "model_type": "idefics2", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } + +vision_config = Idefics2VisionConfig(**default_vision_config) +model = Idefics2VisionTransformer(vision_config) +if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"): + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 4: + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 5: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 6: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 100045: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) + +processor = None +# if model.attn_pool is not None: +# model.attn_pool = torch.nn.Identity() + +# model.blocks = model.blocks[:-1] +model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_minicpmv_projector = False + +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.minicpmv_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_minicpmv_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) +fout.add_file_type(ftype) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_minicpmv_projector: + fout.add_description("vision-only CLIP 
model") +elif has_minicpmv_projector: + fout.add_description("image encoder for MiniCPM-V") + # add projector type + fout.add_string("clip.projector_type", "resampler") + fout.add_int32("clip.minicpmv_version", minicpmv_version) +else: + fout.add_description("two-tower CLIP model") + +if has_vision_encoder: + # vision_model hparams - use actual config values + vision_image_size = model_config.get("image_size", 448) if model_config else 448 + vision_patch_size = default_vision_config.get("patch_size", 14) + vision_hidden_size = default_vision_config.get("hidden_size", 1152) + vision_intermediate_size = default_vision_config.get("intermediate_size", 4304) + vision_attention_heads = default_vision_config.get("num_attention_heads", 16) + + fout.add_uint32("clip.vision.image_size", vision_image_size) + fout.add_uint32("clip.vision.patch_size", vision_patch_size) + fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size) + fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads) + fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) + + # Add MiniCPM-V specific parameters + query_num = model_config.get("query_num", 0) if model_config else 0 + resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0 + fout.add_uint32("clip.minicpmv_query_num", query_num) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = True +fout.add_bool("clip.use_gelu", use_gelu) + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + +def _replace_name_resampler(s, v): + if re.match("resampler.pos_embed", s): + return { + s: v, + re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), + } + if re.match("resampler.proj", s): + return { + re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))), + re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), + } + if re.match("resampler.attn.in_proj_.*", s): + return { + re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], + re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], + re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], + } + return {s: v} + +if has_minicpmv_projector: + projector = torch.load(args.minicpmv_projector) + new_state_dict = {} + for k, v in projector.items(): + kvs = _replace_name_resampler(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv + projector = new_state_dict + ftype_cur = 0 + for name, data in projector.items(): + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + if ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + fout.add_tensor(name, data) + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + + print("Projector tensors added\n") + +def _replace_name(s, v): + s = "vision_model." 
+ s + if re.match("vision_model.embeddings.position_embedding", s): + v = v.unsqueeze(0) + return {s: v} + + return {s: v} + +state_dict = model.state_dict() +new_state_dict = {} +for k, v in state_dict.items(): + kvs = _replace_name(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv +state_dict = new_state_dict +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py new file mode 100644 index 0000000..5352662 --- /dev/null +++ b/llama.cpp/tools/mtmd/legacy-models/minicpmv-surgery.py @@ -0,0 +1,47 @@ +import argparse +import os +import torch +from transformers import AutoModel, AutoTokenizer + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to MiniCPM-V model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True: + projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb +torch.save(projector, f"{args.model}/minicpmv.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] +if len(clip_tensors) > 0: + clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/minicpmv.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +config = model.llm.config +config.auto_map = { + "AutoConfig": "configuration_minicpm.MiniCPMConfig", + "AutoModel": "modeling_minicpm.MiniCPMModel", + "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" +} +model.llm.save_pretrained(f"{args.model}/model") +tok = AutoTokenizer.from_pretrained(args.model, 
trust_remote_code=True) +tok.save_pretrained(f"{args.model}/model") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.") diff --git a/llama.cpp/tools/mtmd/models/cogvlm.cpp b/llama.cpp/tools/mtmd/models/cogvlm.cpp new file mode 100644 index 0000000..d5b739c --- /dev/null +++ b/llama.cpp/tools/mtmd/models/cogvlm.cpp @@ -0,0 +1,98 @@ +#include "models.h" + +ggml_cgraph * clip_graph_cogvlm::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // build input and concatenate class embedding + ggml_tensor * inp = build_inp(); + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + ggml_tensor * inpL = inp; + + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], n_embd * sizeof(float)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 2 * n_embd * sizeof(float)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "layer_out", il); + inpL = cur; + + } + + // remove CLS token (like build_llama4 does) + ggml_tensor * cur = ggml_view_2d(ctx0, inpL, + n_embd, n_patches, + ggml_row_size(inpL->type, n_embd), 0); + + // Multiply with mm_model_proj + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + + // Apply layernorm, weight, bias + cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + + // Apply GELU + cur = ggml_gelu_inplace(ctx0, cur); + + // Branch 1: multiply with mm_h_to_4h_w + ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + + // Branch 2: multiply with mm_gate_w + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + + // Apply silu + gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + + // Apply mm_4h_to_h_w + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + + // Concatenate with boi and eoi + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/conformer.cpp b/llama.cpp/tools/mtmd/models/conformer.cpp new file mode 100644 index 0000000..9b1fab4 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/conformer.cpp @@ -0,0 +1,216 @@ +#include "models.h" + +ggml_cgraph * 
clip_graph_conformer::build() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + const int n_pos_embd = (((((n_frames + 1) / 2) + 1) / 2 + 1) / 2) * 2 - 1; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 512, n_pos_embd); + ggml_set_name(pos_emb, "pos_emb"); + ggml_set_input(pos_emb); + ggml_build_forward_expand(gf, pos_emb); + + ggml_tensor * inp = build_inp_raw(1); + + auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + + // pre encode, conv subsampling + { + // layer.0 - conv2d + cur = ggml_conv_2d(ctx0, model.pre_encode_conv_X_w[0], cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[0]); + cb(cur, "conformer.pre_encode.conv.{}", 0); + + // layer.1 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // layer.2 conv2d dw + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[2], cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[2]); + cb(cur, "conformer.pre_encode.conv.{}", 2); + + // layer.3 conv2d + cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[3], cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[3]); + cb(cur, "conformer.pre_encode.conv.{}", 3); + + // layer.4 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // layer.5 conv2d dw + cur = ggml_conv_2d_dw_direct(ctx0, model.pre_encode_conv_X_w[5], cur, 2, 2, 1, 1, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[5]); + cb(cur, "conformer.pre_encode.conv.{}", 5); + + // layer.6 conv2d + cur = ggml_conv_2d_direct(ctx0, model.pre_encode_conv_X_w[6], cur, 1, 1, 0, 0, 1, 1); + cur = ggml_add(ctx0, cur, model.pre_encode_conv_X_b[6]); + cb(cur, "conformer.pre_encode.conv.{}", 6); + + // layer.7 - relu + cur = ggml_relu_inplace(ctx0, cur); + + // flatten channel and frequency axis + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 0, 2, 1, 3)); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); + + // calculate out + cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur); + cur = ggml_add(ctx0, cur, model.pre_encode_out_b); + cb(cur, "conformer.pre_encode.out", -1); + } + + // pos_emb + cb(pos_emb, "pos_emb", -1); + + for (int il = 0; il < hparams.n_layer; il++) { + const auto & layer = model.layers[il]; + + auto * residual = cur; + + cb(cur, "layer.in", il); + + // feed_forward1 + cur = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_feed_forward1", il); + + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, nullptr, nullptr, layer.ff_down_w, layer.ff_down_b, FFN_SILU, + il); + cb(cur, "conformer.layers.{}.feed_forward1.linear2", il); + + const auto fc_factor = 0.5f; + residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor)); + + // self-attention + { + cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_self_att", il); + + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]); + ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u); + Q_bias_u = ggml_permute(ctx0, Q_bias_u, 0, 2, 1, 3); + ggml_tensor * Q_bias_v = ggml_add(ctx0, Qcur, layer.pos_bias_v); + Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3); + + // TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, 
cur); + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]); + Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3)); + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]); + Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3)); + + // build_attn won't fit due to matrix_ac and matrix_bd separation + ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Q_bias_u, Kcur); + matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3)); + cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il); + + auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb); + cb(p, "conformer.layers.{}.self_attn.linear_pos", il); + p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]); + p = ggml_permute(ctx0, p, 0, 2, 1, 3); + + auto * matrix_bd = ggml_mul_mat(ctx0, Q_bias_v, p); + matrix_bd = ggml_cont(ctx0, ggml_permute(ctx0, matrix_bd, 1, 0, 2, 3)); + + // rel shift + { + const auto pos_len = matrix_bd->ne[0]; + const auto q_len = matrix_bd->ne[1]; + const auto h = matrix_bd->ne[2]; + matrix_bd = ggml_pad(ctx0, matrix_bd, 1, 0, 0, 0); + matrix_bd = ggml_roll(ctx0, matrix_bd, 1, 0, 0, 0); + matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, q_len, pos_len + 1, h); + matrix_bd = ggml_view_3d(ctx0, matrix_bd, q_len, pos_len, h, matrix_bd->nb[1], + matrix_bd->nb[2], matrix_bd->nb[0] * q_len); + matrix_bd = ggml_cont_3d(ctx0, matrix_bd, pos_len, q_len, h); + } + + matrix_bd = ggml_view_3d(ctx0, matrix_bd, matrix_ac->ne[0], matrix_bd->ne[1], + matrix_bd->ne[2], matrix_bd->nb[1], matrix_bd->nb[2], 0); + auto * scores = ggml_add(ctx0, matrix_ac, matrix_bd); + scores = ggml_scale(ctx0, scores, 1.0f / std::sqrt(d_head)); + cb(scores, "conformer.layers.{}.self_attn.id0", il); + + ggml_tensor * attn = ggml_soft_max(ctx0, scores); + ggml_tensor * x = ggml_mul_mat(ctx0, attn, Vcur); + x = ggml_permute(ctx0, x, 2, 0, 1, 3); + x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]); + + ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x); + out = ggml_add(ctx0, out, layer.o_b); + cb(out, "conformer.layers.{}.self_attn.linear_out", il); + + cur = out; + } + + residual = ggml_add(ctx0, residual, cur); + cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_conv", il); + + // conv + { + auto * x = cur; + x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x); + x = ggml_add(ctx0, x, layer.conv_pw1_b); + cb(x, "conformer.layers.{}.conv.pointwise_conv1", il); + + // ggml_glu doesn't support sigmoid + // TODO @ngxson : support this ops in ggml + { + int64_t d = x->ne[0] / 2; + ggml_tensor * gate = ggml_sigmoid(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0])); + x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate); + x = ggml_cont(ctx0, ggml_transpose(ctx0, x)); + } + + // use ggml_ssm_conv for f32 precision + x = ggml_pad(ctx0, x, 4, 0, 0, 0); + x = ggml_roll(ctx0, x, 4, 0, 0, 0); + x = ggml_pad(ctx0, x, 4, 0, 0, 0); + x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w); + x = ggml_add(ctx0, x, layer.conv_dw_b); + + x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b); + x = ggml_silu(ctx0, x); + + // pointwise_conv2 + x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x); + x = ggml_add(ctx0, x, layer.conv_pw2_b); + + cur = x; + } + + residual = ggml_add(ctx0, residual, cur); + + cur = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b, 
NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_feed_forward2", il); + + cur = build_ffn(cur, layer.ff_up_1_w, layer.ff_up_1_b, nullptr, nullptr, layer.ff_down_1_w, layer.ff_down_1_b, + FFN_SILU, il); // TODO(tarek): read activation for ffn from hparams + cb(cur, "conformer.layers.{}.feed_forward2.linear2", il); + + residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, fc_factor)); + cb(residual, "conformer.layers.{}.conv.id", il); + + cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, 1e-5, il); + cb(cur, "conformer.layers.{}.norm_out", il); + } + + // audio adapter + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cb(cur, "audio_adapter.model.{}", 0); + cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_3_w, model.mm_3_b, FFN_GELU_ERF, -1); + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/glm4v.cpp b/llama.cpp/tools/mtmd/models/glm4v.cpp new file mode 100644 index 0000000..f39b692 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/glm4v.cpp @@ -0,0 +1,120 @@ +#include "models.h" + +ggml_cgraph * clip_graph_glm4v::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + + norm_type norm_t = NORM_TYPE_RMS; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches * 4); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + + // pos-conv norm + inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1); + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + cb(learned_pos_embd, "learned_pos_embd", -1); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return ggml_rope_multi( + ctx0, cur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, + 32768, hparams.rope_theta, 1, 0, 1, 32, 1); + }; + + 
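+    // (editor note, illustrative) positions carries four position coordinates per patch
+    // (hence the n_patches * 4 tensor above); mrope_sections splits the rotary dimensions
+    // evenly across them, d_head/4 each -- e.g. if d_head were 64, each coordinate stream
+    // would rotate 16 dims. See ggml_rope_multi / GGML_ROPE_TYPE_VISION for the exact layout.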
ggml_tensor * cur = build_vit( + inp, n_patches, + norm_t, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + // cb(ggml_sum(ctx0, cur), "vit_out_sum", -1); + + // GLM4V projector + // ref: https://github.com/huggingface/transformers/blob/40dc11cd3eb4126652aa41ef8272525affd4a636/src/transformers/models/glm4v/modeling_glm4v.py#L116-L130 + + // patch merger (downsample) + { + int n_merge = hparams.n_merge; + GGML_ASSERT(n_merge > 0); + + int n_token_out = n_patches / n_merge / n_merge; + cur = ggml_reshape_4d(ctx0, cur, n_embd, n_merge, n_merge, n_token_out); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); // [n_merge, n_merge, n_embd, n_token_out] + cur = ggml_conv_2d(ctx0, model.mm_patch_merger_w, cur, n_merge, n_merge, 0, 0, 1, 1); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[2], n_token_out); // [n_embd_out, n_token_out] + + cur = ggml_add(ctx0, cur, model.mm_patch_merger_b); + } + + // FC projector + { + cur = ggml_mul_mat(ctx0, model.projection, cur); + // default LayerNorm (post_projection_norm) + cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "after_fc_proj", -1); + } + + // FFN projector + { + cur = build_ffn(cur, + model.mm_ffn_up_w, model.mm_ffn_up_b, + model.mm_ffn_gate_w, model.mm_ffn_gate_b, + model.mm_ffn_down_w, model.mm_ffn_down_b, + hparams.ffn_op, -1); + cb(cur, "after_ffn_proj", -1); + // cb(ggml_sum(ctx0, cur), "merged_sum", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/internvl.cpp b/llama.cpp/tools/mtmd/models/internvl.cpp new file mode 100644 index 0000000..9aded3b --- /dev/null +++ b/llama.cpp/tools/mtmd/models/internvl.cpp @@ -0,0 +1,69 @@ +#include "models.h" + +ggml_cgraph * clip_graph_internvl::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; + ggml_tensor * inp = build_inp(); + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // The larger models use a different ViT, which uses RMS norm instead of layer norm + // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 + norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) + ? 
NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) + : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) + + ggml_tensor * cur = build_vit( + inp, n_pos, + norm_t, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = n_patches_y; + const int width = n_patches_x; + GGML_ASSERT(scale_factor > 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + cur->ne[1] * cur->ne[2]); + } + + // projector (always using GELU activation) + { + // projector LayerNorm uses pytorch's default eps = 1e-5 + // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_3_w, model.mm_3_b, + FFN_GELU, + -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/kimik25.cpp b/llama.cpp/tools/mtmd/models/kimik25.cpp new file mode 100644 index 0000000..cf9f27f --- /dev/null +++ b/llama.cpp/tools/mtmd/models/kimik25.cpp @@ -0,0 +1,101 @@ +#include "models.h" +#include +#include + +// note: this is similar to clip_graph::resize_position_embeddings, major difference is having +// the w/h in ne[1] and ne[2] instead of assuming with sqrt. Could try storing the tensor in 2D instead +// with a w*h? Also the permute is a bit different at (2, 1, 0, 3) instead of (2, 0, 1, 3). 
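+//
+// Shape sketch (editor annotation, illustrative only -- it just restates the code below): the
+// stored pos_embd is kept as [C, W, H] (per the comments below, C = 1152 with a 64x64 grid).
+// When the actual patch grid (img.nx/patch_size x img.ny/patch_size) differs, the spatial axes
+// are brought forward, the grid is interpolated to the new (width, height) -- bicubic when
+// called from build() -- and the result is flattened back to [C, width*height] so it can be
+// added per patch.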
+ggml_tensor * clip_graph_kimik25::resize_position_embeddings_3d(uint32_t interpolation_mode) {
+    ggml_tensor * pos_embd = model.position_embeddings;
+    const int height = img.ny / patch_size;
+    const int width = img.nx / patch_size;
+    const uint32_t mode = interpolation_mode;
+
+    GGML_ASSERT(pos_embd);
+
+    const int64_t stored_c = pos_embd->ne[0]; // C = 1152
+    const int64_t orig_w = pos_embd->ne[1]; // W = 64
+    const int64_t orig_h = pos_embd->ne[2]; // H = 64
+
+    GGML_ASSERT(stored_c == n_embd);
+
+    if (height == (int)orig_h && width == (int)orig_w) {
+        // No interpolation needed, just flatten to [C, H*W]
+        return ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    }
+
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_interpolate(ctx0, pos_embd, height, width, n_embd, 1, mode);
+    pos_embd = ggml_permute(ctx0, pos_embd, 2, 1, 0, 3);
+    pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height);
+    return pos_embd;
+}
+
+ggml_cgraph * clip_graph_kimik25::build() {
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);
+
+    // Kimi-K2.5 natively uses an interleaved 2D RoPE pattern, but
+    // Q / K are permuted during conversion to use the split format.
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        return cur;
+    };
+
+    ggml_tensor * inp = build_inp();
+
+    // For reasons not yet understood, passing learned_pos_embd through build_vit
+    // led to the ggml_add not occurring; adding it manually here works.
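+    // (editor note) inp is [n_embd, n_patches] and learned_pos_embd was flattened to
+    // [n_embd, width*height] == [n_embd, n_patches] above, so this is a plain element-wise
+    // add -- the same addition build_vit would perform if given a non-null learned_pos_embd.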
+ inp = ggml_add(ctx0, inp, learned_pos_embd); + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + nullptr, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + int n_merged_patches = cur->ne[1]; + cur = ggml_view_2d(ctx0, cur, + n_embd, n_merged_patches * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, n_merged_patches, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/kimivl.cpp b/llama.cpp/tools/mtmd/models/kimivl.cpp new file mode 100644 index 0000000..0a06f50 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/kimivl.cpp @@ -0,0 +1,63 @@ +#include "models.h" + +ggml_cgraph * clip_graph_kimivl::build() { + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + cur = ggml_view_2d(ctx0, cur, + n_embd, cur->ne[1] * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/llama4.cpp b/llama.cpp/tools/mtmd/models/llama4.cpp new file mode 100644 index 0000000..30d1df5 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/llama4.cpp @@ -0,0 +1,96 @@ +#include "models.h" + +ggml_cgraph * clip_graph_llama4::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // 2D input positions + ggml_tensor * pos_h = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp = build_inp_raw(); + + // Llama4UnfoldConvolution + { + ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, + patch_size, patch_size, 3, n_embd); + inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + cb(inp, "patch_conv", -1); + } + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 + // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + add_pos); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + // based on Llama4VisionPixelShuffleMLP + // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + GGML_ASSERT(scale_factor > 0); + GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images + cur = ggml_reshape_4d(ctx0, cur, + n_embd * scale_factor, + n_patches_x / scale_factor, + n_patches_y, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches_x / scale_factor, + n_patches_y / scale_factor, + bsz); + //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches / scale_factor / scale_factor); + cb(cur, "pixel_shuffle", -1); + } + + // based on Llama4VisionMLP2 (always uses GELU activation, no bias) + { + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); + cur = ggml_gelu(ctx0, cur); + cb(cur, "adapter_mlp", -1); + } + + // Llama4MultiModalProjector + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cb(cur, "projected", -1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/llava.cpp b/llama.cpp/tools/mtmd/models/llava.cpp new file mode 100644 index 0000000..0bfb5f0 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/llava.cpp @@ -0,0 +1,374 @@ +#include "models.h" + +// this graph is used by llava, granite and glm +// due to having embedding_stack (used by granite), we cannot reuse build_vit +ggml_cgraph * clip_graph_llava::build() { + const int batch_size = 1; + const int n_pos = n_patches + (model.class_embedding ? 
1 : 0); + + GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); + + // Calculate the deepest feature layer based on hparams and projector type + int max_feature_layer = n_layer; + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int il_last = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) { + il_last += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; + } + + ggml_tensor * inp = build_inp(); + + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); + cb(inpL, "pre_ln", -1); + } + + std::vector embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + + // loop over layers + for (int il = 0; il < max_feature_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. 
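+        // (editor note, hypothetical example) with vision_feature_layer = {3, 7, 15, 26},
+        // the tensors pushed here are the inputs of layers 3, 7, 15 and 26 (index 0 being the
+        // encoder input); they are concatenated along the embedding dim after the loop.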
+ if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(cur); + } + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * embeddings = inpL; + + // process vision feature layers (used by granite) + { + // final layer is a vision feature layer + if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + embedding_stack.push_back(inpL); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + } + + // llava projector (also used by granite) + if (hparams.has_llava_projector) { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + if (model.mm_2_w) { + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + } + else if (proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // 
GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], 
ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = 
ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else { + GGML_ABORT("fatal error"); + } + } + + // glm projector + else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_swiglu_split(ctx0, embeddings, x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + } + // arrangement of BOI/EOI token embeddings + // note: these embeddings are not present in text model, hence we cannot process them as text tokens + // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 + { + embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI + } + } + + else { + GGML_ABORT("llava: unknown projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/minicpmv.cpp b/llama.cpp/tools/mtmd/models/minicpmv.cpp new file mode 100644 index 0000000..3594ea2 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/minicpmv.cpp @@ -0,0 +1,114 @@ +#include "models.h" + +ggml_cgraph * clip_graph_minicpmv::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + const int n_embd_proj = n_mmproj_embd; + + // position embeddings for the projector (not for ViT) + // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 + // base frequency omega + ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); + ggml_set_name(omega, "omega"); + ggml_set_input(omega); + + // 2D input positions (using float for sinusoidal embeddings) + ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_pos, + 
NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // calculate sinusoidal pos embd + ggml_tensor * pos_embed = nullptr; + { + // outer product + ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows + ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); + ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); + // sin and cos + ggml_tensor * pos_embd_x = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_x), + ggml_cos(ctx0, theta_x), + 0 // concat on first dim + ); + ggml_tensor * pos_embd_y = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_y), + ggml_cos(ctx0, theta_y), + 0 // concat on first dim + ); + pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); + } + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + const int d_head = 128; + int n_head = n_embd_proj/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, resampler_kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/mobilenetv5.cpp b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp new file mode 100644 index 0000000..593afa1 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/mobilenetv5.cpp @@ -0,0 +1,451 @@ +#include "models.h" + +// Helpers for MobileNetV5 Blocks +// RMS Norm 2D - normalizes over channels for each spatial position +ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) { + // inp: [W, H, C, B] + + ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + cur = ggml_rms_norm(ctx0, cur, eps); + + if (weight) { + cur = ggml_mul(ctx0, cur, weight); + } + + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); + cur = ggml_cont(ctx0, cur); + + return cur; +} + +// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF +ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) { + const int64_t ih = 
inp->ne[1]; // height + const int64_t iw = inp->ne[0]; // width + + // Calculate output size (ceil division) + const int64_t oh = (ih + stride_h - 1) / stride_h; + const int64_t ow = (iw + stride_w - 1) / stride_w; + + // Calculate padding needed + const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih); + const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw); + + // Split padding asymmetrically + const int pad_h_top = pad_h / 2; + const int pad_h_bottom = pad_h - pad_h_top; + const int pad_w_left = pad_w / 2; + const int pad_w_right = pad_w - pad_w_left; + + // Apply padding if needed + // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch + if (pad_h > 0 || pad_w > 0) { + inp = ggml_pad_ext(ctx0, inp, + pad_w_left, pad_w_right, // width padding (dim 0) + pad_h_top, pad_h_bottom, // height padding (dim 1) + 0, 0, // no channel padding (dim 2) + 0, 0); // no batch padding (dim 3) + } + + return inp; +} + + +// Edge Residual Block (Stage 0) +ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Expansion Conv (3x3) + if (stride == 2) { + // Case: Downsampling (Block 0) + // Replicates Conv2dSame(kernel=3, stride=2) + cur = pad_same_2d(cur, 3, 3, stride, stride); + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1); + } else { + // Case: Normal 3x3 Block (Block 1, 2) + // Replicates Conv2d(kernel=3, stride=1, padding=1) + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1); + } + + // BN + Activation + if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w); + cur = ggml_gelu(ctx0, cur); + + // 2. Pointwise Linear Conv (1x1) + // 1x1 Convs usually have padding=0 and stride=1 + cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1); + if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w); + + // 3. Residual Connection + // Only apply residual if spatial dimensions and channels match (stride 1) + if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// Universal Inverted Residual Block (Stage 1+) +ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) { + ggml_tensor * cur = inp; + + // 1. Depthwise Start (Optional) + // NOTE: dw_start always has stride=1 (no downsampling here) + if (block.dw_start_w) { + int k = block.dw_start_w->ne[0]; // 3 or 5 + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1); + if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w); + } + + // 2. Pointwise Expansion (1x1) + if (block.pw_exp_w) { + // Standard 1x1 conv, pad=0, stride=1 + cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 3. 
Depthwise Mid (Optional) + // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage) + if (block.dw_mid_w) { + int k = block.dw_mid_w->ne[0]; // 3 or 5 + + if (stride > 1) { + // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding + cur = pad_same_2d(cur, k, k, stride, stride); + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0 + } else { + // Case: Stride 1 -> Use Standard Symmetric Padding + int p = k / 2; + cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1); + } + + if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w); + cur = ggml_gelu(ctx0, cur); + } + + // 4. Pointwise Projection (1x1) + if (block.pw_proj_w) { + cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1); + if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w); + } + + // Apply Layer Scaling if present + if (block.layer_scale_w) { + cur = ggml_mul(ctx0, cur, block.layer_scale_w); + } + + // 5. Residual Connection + bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]); + bool same_channel = (inp->ne[2] == cur->ne[2]); + if (same_spatial && same_channel) { + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +// Attention Block (MQA) +ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) { + ggml_tensor * cur = inp; + + // Norm + if (block.attn_norm_w) { + cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f); + } + + // 1. Q Calculation + ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1); + + // 2. K Calculation (Downsampled) + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * k_inp = cur; + if (block.attn_k_dw_w) { + int k_size = block.attn_k_dw_w->ne[0]; // Usually 3 + k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding + k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_k_norm_w) { + k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f); + } + } + ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1); + + // 3. 
V Calculation (Downsampled) + // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640) + ggml_tensor * v_inp = cur; + if (block.attn_v_dw_w) { + int v_size = block.attn_v_dw_w->ne[0]; // Usually 3 + v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding + v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0 + if (block.attn_v_norm_w) { + v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f); + } + } + ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1); + + const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3]; + const int D = k->ne[2]; // Head dimension + const int n_head = q->ne[2] / D; + const int N = W * H; + + // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B] + q = ggml_reshape_3d(ctx0, q, N, D*n_head, B); + q = ggml_reshape_4d(ctx0, q, N, D, n_head, B); + q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B] + q = ggml_cont(ctx0, q); + + const int Wk = k->ne[0]; const int Hk = k->ne[1]; + const int M = Wk * Hk; + + // Process K: [Wk, Hk, D, B] -> [D, M, 1, B] + k = ggml_reshape_3d(ctx0, k, M, D, B); + k = ggml_reshape_4d(ctx0, k, M, D, 1, B); + k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B] + k = ggml_cont(ctx0, k); + + // Process V: [Wk, Hk, D, B] -> [M, D, 1, B] + v = ggml_reshape_3d(ctx0, v, M, D, B); + v = ggml_reshape_4d(ctx0, v, M, D, 1, B); + v = ggml_cont(ctx0, v); // [M, D, 1, B] + + // Multi-Query Attention + float scale = 1.0f / sqrtf((float)D); + + // Step 1: Compute Q @ K.T + ggml_tensor * scores = ggml_mul_mat(ctx0, k, q); + + scores = ggml_scale(ctx0, scores, scale); + + scores = ggml_soft_max(ctx0, scores); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores); + + kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3); + kqv = ggml_cont(ctx0, kqv); + + + kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B); + kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B); + kqv = ggml_cont(ctx0, kqv); + + // Output projection + cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1); + + // Residual & Layer Scale + if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) { + if (block.layer_scale_w) { + cur = ggml_mul(ctx0, cur, block.layer_scale_w); + } + cur = ggml_add(ctx0, cur, inp); + } + + return cur; +} + +ggml_cgraph * clip_graph_mobilenetv5::build() { + ggml_tensor * inp = build_inp_raw(); + + // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2)) + ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding + + cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0 + if (model.mobilenet_stem_conv_b) { + cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b); + } + if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w); + cur = ggml_gelu(ctx0, cur); + + + // 2. 
Blocks + std::vector intermediate_features; + const int total_blocks = model.mobilenet_blocks.size(); + + auto is_stage_start = [&](int i) { + if (i == 0) return true; + for (int end_idx : model.mobilenet_stage_ends) { + if (i == end_idx + 1) return true; + } + return false; + }; + + auto is_fusion_point = [&](int i) { + if (model.mobilenet_stage_ends.size() >= 4) { + if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2 + if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3 + } else { + if (i == total_blocks - 1) return true; + } + return false; + }; + + for (int i = 0; i < total_blocks; i++) { + const auto & block = model.mobilenet_blocks[i]; + int stride = is_stage_start(i) ? 2 : 1; + + if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride); + else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block); + else cur = build_inverted_residual(cur, block, stride); + + if (is_fusion_point(i)) { + + intermediate_features.push_back(cur); + } + } + + // 3. Multi-Scale Fusion Adapter (MSFA) + if (!intermediate_features.empty()) { + + // A. Reference Resolution: PyTorch implementation uses inputs[0] + // We assume intermediate_features[0] is the "High Resolution" target. + // In MobileNet designs, this is typically the feature map with the smallest stride (e.g. 32x32). + ggml_tensor* target_feat = intermediate_features[0]; + int high_res_w = target_feat->ne[0]; + int high_res_h = target_feat->ne[1]; + + std::vector resized_feats; + + // B. Resize inputs to match inputs[0] (High Resolution) + for (auto feat : intermediate_features) { + int feat_w = feat->ne[0]; + int feat_h = feat->ne[1]; + + // PyTorch: if feat_size < high_resolution: interpolate + if (feat_w < high_res_w || feat_h < high_res_h) { + // Calculate scale factor. + // Note: PyTorch 'nearest' works on arbitrary float scales. + // ggml_upscale generally takes integer factors or target sizes depending on helper. + // Assuming standard power-of-2 scaling (e.g. 16 -> 32 means scale=2). + int scale_w = high_res_w / feat_w; + // int scale_h = high_res_h / feat_h; + + // Safety check for non-integer scaling if strictly replicating + GGML_ASSERT(high_res_w % feat_w == 0); + + // Upsample (Nearest Neighbor) + // 2 is the scale factor + feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); + } + resized_feats.push_back(feat); + } + + // C. Concatenate at High Resolution (Channel Dim = 2 in ggml) + cur = resized_feats[0]; + for (size_t k = 1; k < resized_feats.size(); ++k) { + cur = ggml_concat(ctx0, cur, resized_feats[k], 2); + } + + // D. FFN (UniversalInvertedResidual) + // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm + + // 1. Expansion + if (model.msfa_ffn_expand_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1); + + if (model.msfa_ffn_expand_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn); + } + + cur = ggml_gelu(ctx0, cur); + + } + + // 2. Projection (No DW because kernel_size=0) + if (model.msfa_ffn_project_w) { + // 1x1 Conv + cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1); + + // UniversalInvertedResidual typically has a norm after projection + if (model.msfa_ffn_project_bn) { + cur = rms_norm_2d(cur, model.msfa_ffn_project_bn); + } + + } + + // E. Final Downsample to Target Resolution (Output Resolution) + // PyTorch: matches self.output_resolution (e.g. 
16x16) + const int target_out_res = 16; + int current_w = cur->ne[0]; + + if (current_w > target_out_res) { + int s = current_w / target_out_res; + + GGML_ASSERT(current_w % target_out_res == 0); + + // Avg Pool: Kernel=s, Stride=s + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0); + + } + + // F. Final Norm + if (model.msfa_concat_norm_w) { + cur = rms_norm_2d(cur, model.msfa_concat_norm_w); + + } + } + + // 4. Gemma 3n Multimodal Projection (Embedder) + // Input: 'cur' is [Width, Height, Channels, Batch] + int W = cur->ne[0]; + int H = cur->ne[1]; + int C = cur->ne[2]; + int B = cur->ne[3]; + + GGML_ASSERT(C == hparams.n_embd); + + // 1. Permute and Flatten to [Channels, Tokens, Batch] + // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch) + cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B] + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B] + cur = ggml_cont(ctx0, cur); + cur = ggml_reshape_3d(ctx0, cur, C, W*H, B); + cur = ggml_cont(ctx0, cur); + + + // 2. FEATURE SCALING + // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5 + const float scale_factor = sqrtf((float)C); + cur = ggml_scale(ctx0, cur, scale_factor); + + + // 3. SOFT EMBEDDING NORM + // PyTorch: self._norm(x) * self.weight + // We must normalize regardless, then multiply if weight exists. + { + const float eps = 1e-6f; // Gemma3n uses 1e-6 + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_soft_emb_norm_w) { + // Weight shape is (2048,) -> Element-wise broadcast multiply + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + } + + } + + // 4. PROJECTION + // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False) + // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size] + if (model.mm_input_proj_w) { + cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur); + } + + // 5. POST PROJECTION NORM + // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False) + // with_scale=False means weight is registered as buffer with value 1.0 + // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1 + { + const float eps = 1e-6f; + cur = ggml_rms_norm(ctx0, cur, eps); + + if (model.mm_post_proj_norm_w) { + // If weight is loaded, multiply (should be ~1.0 anyway) + cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w); + } + } + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/models.h b/llama.cpp/tools/mtmd/models/models.h new file mode 100644 index 0000000..c4c67ac --- /dev/null +++ b/llama.cpp/tools/mtmd/models/models.h @@ -0,0 +1,118 @@ +#pragma once + +#include "../clip-graph.h" + +/* + * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated. + * We encourage human contributors to ensure the quality and reliability of the codebase. 
+ */ + +struct clip_graph_siglip : clip_graph { + clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_pixtral : clip_graph { + clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen2vl : clip_graph { + clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen3vl : clip_graph { + clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_youtuvl : clip_graph { + clip_graph_youtuvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_minicpmv : clip_graph { + clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_internvl : clip_graph { + clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llama4 : clip_graph { + clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_kimivl : clip_graph { + clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_cogvlm : clip_graph { + clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llava : clip_graph { + clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_whisper_enc : clip_graph { + clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_conformer : clip_graph { + clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_glm4v : clip_graph { + clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_mobilenetv5 : clip_graph { + clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + + ggml_tensor * rms_norm_2d( + ggml_tensor * inp, + ggml_tensor * weight, + float eps = 1e-6f); + + ggml_tensor* pad_same_2d( + ggml_tensor* inp, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int dilation_h = 1, + int dilation_w = 1); + + ggml_tensor * build_edge_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride); + + ggml_tensor * build_inverted_residual( + ggml_tensor * inp, + const mobilenetv5_block & block, + int stride); + + ggml_tensor * build_mobilenet_attn( + ggml_tensor * inp, + const mobilenetv5_block & block); +}; + +struct clip_graph_kimik25 : clip_graph { + clip_graph_kimik25(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + + ggml_tensor * resize_position_embeddings_3d(uint32_t interpolation_mode); +}; diff --git a/llama.cpp/tools/mtmd/models/pixtral.cpp b/llama.cpp/tools/mtmd/models/pixtral.cpp new file mode 100644 
index 0000000..a849210 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/pixtral.cpp @@ -0,0 +1,86 @@ +#include "models.h" + +ggml_cgraph * clip_graph_pixtral::build() { + const int n_merge = hparams.n_merge; + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // no learned pos embd + add_pos); + + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.n_merge > 0); + + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + } + + // LlavaMultiModalProjector (always using GELU activation) + { + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + } + + // arrangement of the [IMG_BREAK] token + if (model.token_embd_img_break) { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] + + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? 
n_patches_x / n_merge : n_patches_x; + const int p_total = p_x * p_y; + const int n_embd_text = cur->ne[0]; + const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + tmp = ggml_concat(ctx0, tmp, tok, 1); + cur = ggml_view_2d(ctx0, tmp, + n_embd_text, n_tokens_output, + ggml_row_size(tmp->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/qwen2vl.cpp b/llama.cpp/tools/mtmd/models/qwen2vl.cpp new file mode 100644 index 0000000..85f158b --- /dev/null +++ b/llama.cpp/tools/mtmd/models/qwen2vl.cpp @@ -0,0 +1,183 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen2vl::build() { + GGML_ASSERT(model.patch_bias == nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const bool use_window_attn = hparams.n_wa_pattern > 0; + const int n_wa_pattern = hparams.n_wa_pattern; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL + ? NORM_TYPE_RMS // qwen 2.5 vl + : NORM_TYPE_NORMAL; // qwen 2 vl + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * 
n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/qwen3vl.cpp b/llama.cpp/tools/mtmd/models/qwen3vl.cpp new file mode 100644 index 0000000..5ecb10f --- /dev/null +++ b/llama.cpp/tools/mtmd/models/qwen3vl.cpp @@ -0,0 +1,193 @@ +#include "models.h" + +ggml_cgraph * 
clip_graph_qwen3vl::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + if (model.patch_bias != nullptr) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "inp_pos_emb", -1); + + ggml_tensor * inpL = inp; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] + ggml_tensor * deepstack_features = nullptr; + const int merge_factor = hparams.n_merge > 0 ? 
hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + if (layer.has_deepstack()) { + ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); + feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); + feat = build_ffn(feat, + layer.deepstack_fc1_w, layer.deepstack_fc1_b, + nullptr, nullptr, + layer.deepstack_fc2_w, layer.deepstack_fc2_b, + ffn_op_type::FFN_GELU, il); + + if(!deepstack_features) { + deepstack_features = feat; + } else { + // concat along the feature dimension + deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + } + } + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + ffn_op_type::FFN_GELU, -1); + + if (deepstack_features) { + embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); + } // concat along the feature dimension + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/siglip.cpp b/llama.cpp/tools/mtmd/models/siglip.cpp new file mode 100644 index 0000000..b866a11 --- /dev/null +++ 
b/llama.cpp/tools/mtmd/models/siglip.cpp @@ -0,0 +1,86 @@ +#include "models.h" + +ggml_cgraph * clip_graph_siglip::build() { + ggml_tensor * inp = build_inp(); + + ggml_tensor * learned_pos_embd = model.position_embeddings; + if (proj_type == PROJECTOR_TYPE_LFM2) { + learned_pos_embd = resize_position_embeddings(); + } + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + if (proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + GGML_ASSERT(n_patches_x == n_patches_y); + const int patches_per_image = n_patches_x; + const int kernel_size = hparams.n_merge; + + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + + // doing a pool2d to reduce the number of output tokens + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + // apply norm before projection + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + + // apply projection + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // pixel_shuffle + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + cur = ggml_mul_mat(ctx0, model.projection, cur); + + } else if (proj_type == PROJECTOR_TYPE_LFM2) { + // pixel unshuffle block + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection, in LFM2-VL input norm is optional + if (model.mm_input_norm_w) { + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + } + + if (model.mm_input_norm_b) { + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + } + + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + + } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) { + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + hparams.ffn_op, + -1); + + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/whisper-enc.cpp b/llama.cpp/tools/mtmd/models/whisper-enc.cpp new file mode 100644 index 0000000..2f2b127 --- /dev/null +++ b/llama.cpp/tools/mtmd/models/whisper-enc.cpp @@ -0,0 +1,115 @@ +#include "models.h" + +ggml_cgraph * clip_graph_whisper_enc::build() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * inp = build_inp_raw(1); + + // conv1d block + { + // convolution + gelu + ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_1_b); + + cur = ggml_gelu_erf(ctx0, cur); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); + cur = ggml_add(ctx0, cur, model.conv1d_2_b); + + cur = ggml_gelu_erf(ctx0, cur); + // transpose + inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(inp, "after_conv1d", 
-1); + } + + // sanity check (only check one layer, but it should be the same for all) + GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); + GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); + GGML_ASSERT(model.layers[0].q_b); + GGML_ASSERT(model.layers[0].v_b); + GGML_ASSERT(!model.layers[0].k_b); // no bias for k + + ggml_tensor * pos_embd_selected = ggml_view_2d( + ctx0, model.position_embeddings, + model.position_embeddings->ne[0], n_pos, + model.position_embeddings->nb[1], 0 + ); + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + pos_embd_selected, + nullptr); + + cb(cur, "after_transformer", -1); + + if (model.audio_has_stack_frames()) { + // StackAudioFrames + // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py + cur = build_stack(cur, hparams.proj_stack_factor, n_embd); + cb(cur, "after_stacked", -1); + } + + if (proj_type == PROJECTOR_TYPE_ULTRAVOX) { + // UltravoxProjector + // pre-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + + // ffn in + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + + // swiglu + // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half + cur = ggml_swiglu_swapped(ctx0, cur); + + // mid-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); + + // ffn out + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + + } else if (proj_type == PROJECTOR_TYPE_QWEN2A) { + // projector + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + } else if (proj_type == PROJECTOR_TYPE_GLMA) { + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + cur = ggml_add(ctx0, cur, model.mm_norm_pre_b); + cur = build_stack(cur, hparams.proj_stack_factor, n_embd); + cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0); + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + } else { + GGML_ABORT("%s: unknown projector type", __func__); + } + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/models/youtuvl.cpp b/llama.cpp/tools/mtmd/models/youtuvl.cpp new file mode 100644 index 0000000..ffbf2be --- /dev/null +++ b/llama.cpp/tools/mtmd/models/youtuvl.cpp @@ -0,0 +1,179 @@ +#include "models.h" + +ggml_cgraph * clip_graph_youtuvl::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int batch_size = 1; + const bool use_window_attn = !hparams.wa_layer_indexes.empty(); + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; + const int m = 2; + const int Wp = n_patches_x; + const int Hp = n_patches_y; + const int Hm = Hp / m; + const int Wm = Wp / m; + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp = build_inp_raw(); + + // change conv3d to linear + // reshape and permute to get patches, permute from (patch_size, m, Wm, 
patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm) + { + inp = ggml_reshape_4d( + ctx0, inp, + Wm * m * patch_size, m * patch_size, Hm, 3); + inp = ggml_permute(ctx0, inp, 1, 2, 3, 0); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, Wm, m * patch_size, Hm); + + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_4d( + ctx0, inp, + m * patch_size * 3, patch_size, m, Hm * Wm); + + inp = ggml_permute(ctx0, inp, 1, 0, 2, 3); + inp = ggml_cont_4d( + ctx0, inp, + patch_size, 3, patch_size, Hm * Wm * m * m); + + inp = ggml_permute(ctx0, inp, 2, 0, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + 3*patch_size* patch_size, Hm * Wm * m * m, 1); + } + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + } + + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + if (use_window_attn) { + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? hparams.wa_layer_indexes.count(il) > 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + ggml_tensor * attn_mask = full_attn ? 
nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + } + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + nullptr, nullptr, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + + inpL = cur; + } + + ggml_tensor * embeddings = inpL; + if (use_window_attn) { + const int spatial_merge_unit = 4; + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / spatial_merge_unit); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, n_embd * spatial_merge_unit, n_patches / spatial_merge_unit); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd, n_patches, batch_size); + cb(embeddings, "window_order_restored", -1); + } + + // post-layernorm (part of Siglip2VisionTransformer, applied after encoder) + if (model.post_ln_w) { + embeddings = build_norm(embeddings, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // Now apply merger (VLPatchMerger): + // 1. Apply RMS norm (ln_q in VLPatchMerger) + embeddings = build_norm(embeddings, model.mm_input_norm_w, nullptr, NORM_TYPE_RMS, 1e-6, -1); + cb(embeddings, "merger_normed", -1); + + // 2. First reshape for spatial merge (merge 2x2 patches) + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + cb(embeddings, "merger_reshaped", -1); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/llama.cpp/tools/mtmd/mtmd-audio.cpp b/llama.cpp/tools/mtmd/mtmd-audio.cpp new file mode 100644 index 0000000..e8eef03 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-audio.cpp @@ -0,0 +1,730 @@ +#include "mtmd-audio.h" + +#define _USE_MATH_DEFINES // for M_PI +#include +#include +#include +#include +#include +#include +#include + +// some of the code here is copied from whisper.cpp + +constexpr bool DEBUG = false; + +void mtmd_audio_cache::fill_sin_cos_table(int n) { + sin_vals.resize(n); + cos_vals.resize(n); + for (int i = 0; i < n; i++) { + double theta = (2 * M_PI * i) / n; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); + } +} + +void mtmd_audio_cache::fill_hann_window(int length, bool periodic) { + hann_window.resize(length); + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + +void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, + int n_fft, + int sample_rate, + float fmin, + float fmax, + bool slaney_area_norm, + float scale) { + GGML_ASSERT(n_mel > 0 && n_fft > 1); + if (fmax <= 0.0f) { + fmax = 0.5f * sample_rate; + } + + // Slaney scale (matches librosa default) + const double min_log_hz = 1000.0; + const double lin_slope = 3 / 200.; + const double min_log_mel = min_log_hz * lin_slope; + const double log_step = log(6.4) / 27.0; + auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double { + return (f_hz < min_log_hz) ? 
f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step; + }; + auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double { + return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step); + }; + + // infer N_fft from n_fft_bins + const double bin_hz_step = double(sample_rate) / double(n_fft); + + // mel grid: n_mel + 2 edges + const double m_lo = hz_to_mel(fmin); + const double m_hi = hz_to_mel(fmax); + std::vector mel_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + mel_pts[i] = m_lo + (m_hi - m_lo) * (double(i) / (n_mel + 1)); + } + + // convert to Hz + std::vector hz_pts(n_mel + 2); + for (int i = 0; i < n_mel + 2; ++i) { + hz_pts[i] = mel_to_hz(mel_pts[i]); + } + + const int n_fft_bins = n_fft / 2 + 1; + + // filterbank + std::vector out(n_mel * n_fft_bins, 0); + for (int m = 0; m < n_mel; ++m) { + const double f_left = hz_pts[m]; + const double f_center = hz_pts[m + 1]; + const double f_right = hz_pts[m + 2]; + + const double denom_l = std::max(1e-30, f_center - f_left); + const double denom_r = std::max(1e-30, f_right - f_center); + const double enorm = slaney_area_norm ? (2.0 / std::max(1e-30, f_right - f_left)) : 1.0; + + for (int k = 0; k < n_fft_bins; ++k) { + const double f = k * bin_hz_step; + double w = 0.0; + if (f >= f_left && f <= f_center) { + w = (f - f_left) / denom_l; + } else if (f > f_center && f <= f_right) { + w = (f_right - f) / denom_r; + } + out[size_t(m) * size_t(n_fft_bins) + size_t(k)] = float(w * enorm * scale); + } + } + + filters.n_mel = n_mel; + filters.n_fft = n_fft; + filters.data = std::move(out); + + if (DEBUG) { // debug + for (size_t i = 0; i < filters.data.size(); ++i) { + if (filters.data[i] != 0.0f) { + printf("filters[%zu] = %f\n", i, filters.data[i] * 1000.0f); + } + } + } +} + +// Unified DFT implementation for both forward and inverse transforms +// Template parameters: +// Inverse: false = DFT with exp(-2πi·k·n/N), no scaling +// true = IDFT with exp(+2πi·k·n/N), scales by 1/N +// RealInput: true = input is real-valued (stride 1), avoids imaginary computations +// false = input is complex-valued (interleaved real/imag, stride 2) +template +static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, float * out) { + const int n_sin_cos_vals = cache.sin_vals.size(); + const int sin_cos_step = n_sin_cos_vals / N; + + constexpr float sign = Inverse ? 1.0f : -1.0f; + const float scale = Inverse ? 
(1.0f / N) : 1.0f; + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + int idx = (k * n * sin_cos_step) % n_sin_cos_vals; + float cos_val = cache.cos_vals[idx]; + float sin_val = cache.sin_vals[idx]; + + if constexpr (RealInput) { + // Real input: in_im = 0, simplifies to: + // re += in_re * cos_val + // im += sign * in_re * sin_val + float in_re = in[n]; + re += in_re * cos_val; + im += sign * in_re * sin_val; + } else { + float in_re = in[n * 2 + 0]; + float in_im = in[n * 2 + 1]; + // (a + bi) * (cos + sign*i*sin) = (a*cos - sign*b*sin) + (sign*a*sin + b*cos)i + re += in_re * cos_val - sign * in_im * sin_val; + im += sign * in_re * sin_val + in_im * cos_val; + } + } + + out[k * 2 + 0] = re * scale; + out[k * 2 + 1] = im * scale; + } +} + +// Cooley-Tukey FFT/IFFT unified implementation +// Template parameters: +// Inverse: false = FFT with exp(-2πi·k/N), no scaling +// true = IFFT with exp(+2πi·k/N), scales by 0.5 at each level +// RealInput: true = input is real-valued (stride 1) +// false = input is complex-valued (interleaved real/imag, stride 2) +template +static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) { + const int n_sin_cos_vals = cache.sin_vals.size(); + + if (N == 1) { + out[0] = in[0]; + if constexpr (RealInput) { + out[1] = 0.0f; + } else { + out[1] = in[1]; + } + return; + } + + const int half_N = N / 2; + if (N - half_N * 2 == 1) { + // Odd N: fall back to DFT + dft_impl(cache, in, N, out); + return; + } + + // Split into even and odd + if constexpr (RealInput) { + // Real input: stride is 1, copy only real values + float * even = in + N; + for (int i = 0; i < half_N; ++i) { + even[i] = in[2 * i]; + } + float * even_fft = out + 2 * N; + fft_impl(cache, even, half_N, even_fft); + + float * odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i] = in[2 * i + 1]; + } + float * odd_fft = even_fft + N; + fft_impl(cache, odd, half_N, odd_fft); + } else { + // Complex input: stride is 2, copy complex pairs + float * even = in + N * 2; + for (int i = 0; i < half_N; ++i) { + even[i * 2 + 0] = in[2 * i * 2 + 0]; + even[i * 2 + 1] = in[2 * i * 2 + 1]; + } + float * even_fft = out + 2 * N; + fft_impl(cache, even, half_N, even_fft); + + float * odd = even; + for (int i = 0; i < half_N; ++i) { + odd[i * 2 + 0] = in[(2 * i + 1) * 2 + 0]; + odd[i * 2 + 1] = in[(2 * i + 1) * 2 + 1]; + } + float * odd_fft = even_fft + N; + fft_impl(cache, odd, half_N, odd_fft); + } + + float * even_fft = out + 2 * N; + float * odd_fft = even_fft + N; + + const int sin_cos_step = n_sin_cos_vals / N; + + constexpr float sign = Inverse ? 1.0f : -1.0f; + constexpr float scale = Inverse ? 
0.5f : 1.0f; + + for (int k = 0; k < half_N; k++) { + int idx = k * sin_cos_step; // t = 2*M_PI*k/N + float re = cache.cos_vals[idx]; + float im = sign * cache.sin_vals[idx]; + + float re_odd = odd_fft[2 * k + 0]; + float im_odd = odd_fft[2 * k + 1]; + + out[2 * k + 0] = scale * (even_fft[2 * k + 0] + re * re_odd - im * im_odd); + out[2 * k + 1] = scale * (even_fft[2 * k + 1] + re * im_odd + im * re_odd); + + out[2 * (k + half_N) + 0] = scale * (even_fft[2 * k + 0] - re * re_odd + im * im_odd); + out[2 * (k + half_N) + 1] = scale * (even_fft[2 * k + 1] - re * im_odd - im * re_odd); + } +} + +// Forward FFT for real input (used by mel spectrogram) +static void fft(const mtmd_audio_cache & cache, float * in, int N, float * out) { + fft_impl(cache, in, N, out); +} + +// Inverse FFT for complex input +static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) { + fft_impl(cache, in, N, out); +} + +struct filter_params { + int32_t n_mel; + int32_t n_fft_bins; + int32_t hann_window_size; + int32_t hop_length; + int32_t sample_rate; + bool center_padding = false; + float preemph = 0.f; + bool use_natural_log = false; + bool norm_per_feature = false; +}; + +static void log_mel_spectrogram_worker_thread(int ith, + const float * hann, + const std::vector & samples, + int n_samples, + int frame_size, + int frame_step, + int n_threads, + const filter_params & params, + const mtmd_audio_cache & cache, + mtmd_audio_mel & out) { + std::vector fft_in(frame_size * 2, 0.0); + std::vector fft_out(frame_size * 2 * 2 * 2); + + int n_fft_bins = params.n_fft_bins; + int i = ith; + + const auto & filters = cache.filters; + + // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist + GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2)); + GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size()); + // calculate FFT only when fft_in are not all zero + for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) { + const int offset = i * frame_step; + + // apply Hann window (~10% faster) + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + + // fill the rest with zeros + if (n_samples - offset < frame_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } + + // FFT + fft(cache, fft_in.data(), frame_size, fft_out.data()); + + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. + for (int j = 0; j < n_fft_bins; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram + for (int j = 0; j < out.n_mel; j++) { + double sum = 0.0; + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft_bins - 3; k += 4) { + size_t idx = size_t(j) * size_t(n_fft_bins) + size_t(k); + sum += + fft_out[k + 0] * filters.data[idx + 0] + + fft_out[k + 1] * filters.data[idx + 1] + + fft_out[k + 2] * filters.data[idx + 2] + + fft_out[k + 3] * filters.data[idx + 3]; + } + // handle n_fft remainder + for (; k < n_fft_bins; k++) { + sum += fft_out[k] * filters.data[j * n_fft_bins + k]; + } + sum = params.use_natural_log + ? log(sum + 5.960464477539063e-08) + : log10(std::max(sum, 1e-10)); + out.data[j * out.n_len + i] = sum; + } + } + + // Otherwise fft_out are all zero + double sum = params.use_natural_log ? 
log(1e-10) : log10(1e-10); + for (; i < out.n_len; i += n_threads) { + for (int j = 0; j < out.n_mel; j++) { + out.data[j * out.n_len + i] = sum; + } + } +} + +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 +static bool log_mel_spectrogram( + const float * samples, + const int n_samples_in, + const int n_threads, + const filter_params & params, + const mtmd_audio_cache & cache, + mtmd_audio_mel & out) { + //const int64_t t_start_us = ggml_time_us(); + + out.n_len_org = n_samples_in; + int n_samples = n_samples_in; + + // Hann window + const float * hann = cache.hann_window.data(); + const int frame_size = (params.n_fft_bins - 1) * 2; + const int frame_step = params.hop_length; + + // Padding + std::vector samples_padded; + if (params.center_padding) { + const auto pad_amount = frame_size / 2; + samples_padded = std::vector(n_samples + 2 * pad_amount, 0); + std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount); + samples = samples_padded.data(); + n_samples = samples_padded.size(); + } else { + // existing padding logic + int64_t stage_1_pad = params.sample_rate * 30; + int64_t stage_2_pad = frame_size / 2; + samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); + std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); + // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); + // reflective pad 200 samples at the beginning of audio + if (n_samples < stage_2_pad + 1) { + // TODO: Handle short audio differently or return error + return false; + } + std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); + } + + // preemphasis + if (params.preemph) { + const int pad_amount = frame_size / 2; + const float preemph = 0.97f; + float prev = samples_padded[pad_amount]; + for (int i = pad_amount + 1; i + pad_amount < n_samples; ++i) { + float cur = samples_padded[i]; + samples_padded[i] = cur - preemph * prev; + prev = cur; + } + } + + // pad hann window if it's smaller than frame_size + // TODO: probably unnecessary here? (or better doing it in g_cache?) 
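// The padding below just centres the configured analysis window inside a
// frame_size buffer (frame_size = (n_fft_bins - 1) * 2). A minimal,
// self-contained sketch of that centring, assuming the window is never longer
// than the frame (helper name is illustrative only; requires <vector> and
// <algorithm>):
static std::vector<float> center_pad_window(const std::vector<float> & win, int frame_size) {
    std::vector<float> padded(frame_size, 0.0f);
    const int offset = (frame_size - (int) win.size()) / 2; // zeros split evenly on both sides
    std::copy(win.begin(), win.end(), padded.begin() + offset);
    return padded;
}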
+ std::vector hann_window_padded; + if (params.hann_window_size < frame_size) { + hann_window_padded.resize(frame_size); + const int padding = (frame_size - params.hann_window_size) / 2; + std::copy(hann, hann + params.hann_window_size, &hann_window_padded[padding]); + hann = hann_window_padded.data(); + } + + + out.n_mel = params.n_mel; + out.n_len = (n_samples - frame_size) / frame_step + 1; + // TODO: handle these checks better + if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) { + LOG_ERR("%s: size overflow\n", __func__); + return false; + } + if (n_samples < frame_size) { + LOG_ERR("%s: not enough samples after padding\n", __func__); + return false; + } + out.data.resize(out.n_mel * out.n_len); + + { + std::vector workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = + std::thread(log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded), n_samples, + frame_size, frame_step, n_threads, std::cref(params), std::cref(cache), std::ref(out)); + } + + // main thread + log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples, frame_size, frame_step, n_threads, params, + cache, out); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + const int effective_n_len = n_samples_in / frame_step; + if (params.norm_per_feature) { + for (int i = 0; i < out.n_mel; i++) { + double mean = 0; + for (int j = 0; j < effective_n_len; ++j) { + mean += out.data[i * out.n_len + j]; + } + mean /= effective_n_len; + + double var = 0.0; + for (int j = 0; j < effective_n_len; ++j) { + const double value = out.data[i * out.n_len + j] - mean; + var += value * value; + } + var /= effective_n_len - 1; // unbiased + const double mstd = std::sqrt(var + 1e-5); + + for (int j = 0; j < effective_n_len; ++j) { + auto &value = out.data[i * out.n_len + j]; + value = (value - mean) / mstd; + } + + // pad the rest with zeros + for (int j = effective_n_len; j < out.n_len; ++j) { + out.data[i * out.n_len + j] = 0.0; + } + } + } else { + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < out.n_mel*out.n_len; i++) { + if (out.data[i] > mmax) { + mmax = out.data[i]; + } + } + + mmax -= 8.0; + + for (int i = 0; i < out.n_mel*out.n_len; i++) { + if (out.data[i] < mmax) { + out.data[i] = mmax; + } + out.data[i] = (out.data[i] + 4.0)/4.0; + } + } + + // Dump log_mel_spectrogram + if (DEBUG) { + std::ofstream outFile("log_mel_spectrogram.json"); + outFile << "["; + for (uint64_t i = 0; i < out.data.size() - 1; i++) { + outFile << out.data[i] << ", "; + } + outFile << out.data[out.data.size() - 1] << "]"; + outFile.close(); + } + + return true; +} + +// +// mtmd_audio_preprocessor_whisper +// + +void mtmd_audio_preprocessor_whisper::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_whisper::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + if (n_samples == 0) { + // empty audio + return false; + } + + std::vector smpl; + // if input is too short, pad with zeros + // this is to avoid potential issues with stage1/2 padding in log_mel_spectrogram + // TODO: maybe handle this better + size_t min_samples = (size_t) hparams.audio_sample_rate * (hparams.audio_chunk_len + 1); // +1 second margin + if (n_samples < min_samples) { + smpl.resize(min_samples, 0.0f); + 
std::memcpy(smpl.data(), samples, n_samples * sizeof(float)); + samples = smpl.data(); + n_samples = smpl.size(); + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.center_padding = false; + params.preemph = 0.0f; // disabled + params.use_natural_log = false; + params.norm_per_feature = false; + + // make sure the cache is initialized + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + mtmd_audio_mel out_full; + bool ok = log_mel_spectrogram(samples, n_samples, + 4, // n_threads + params, cache, out_full); + if (!ok) { + return false; + } + + // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel + // we always expect the mel to have 3000 silent frames at the end + if (DEBUG) { + printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len); + } + const size_t frames_per_chunk = 3000; + GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk); + for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { + int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); + if ((size_t) n_len < frames_per_chunk) { + break; // last uncomplete chunk will always be a padded chunk, safe to ignore + } + + mtmd_audio_mel out_chunk; + out_chunk.n_len = n_len; + out_chunk.n_mel = out_full.n_mel; + out_chunk.n_len_org = out_full.n_mel; // unused + out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); + + for (int i = 0; i < out_full.n_mel; i++) { + auto src = out_full.data.begin() + i * out_full.n_len + off; + out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); + } + + output.push_back(std::move(out_chunk)); + } + + return true; +} + +// +// mtmd_audio_preprocessor_conformer +// + +void mtmd_audio_preprocessor_conformer::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + cache.fill_hann_window(hparams.audio_window_len, true); + cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate); +} + +bool mtmd_audio_preprocessor_conformer::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + // empty audio + if (n_samples == 0) { + return false; + } + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_window_len; + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.center_padding = true; + params.preemph = 0.97f; + params.use_natural_log = true; + params.norm_per_feature = true; + + // make sure the cache is initialized + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + GGML_ASSERT(!cache.filters.data.empty()); + + mtmd_audio_mel out_full; + bool ok = log_mel_spectrogram(samples, n_samples, + 4, // n_threads + params, cache, out_full); + if (!ok) { + return false; + } + + output.push_back(std::move(out_full)); + return true; +} + +// +// mtmd_audio_streaming_istft implementation +// + +mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length) : + n_fft(n_fft), + hop_length(hop_length), + n_fft_bins(n_fft / 2 + 1), + overlap_buffer(n_fft, 0.0f), + window_sum_buffer(n_fft, 0.0f), + padding_to_remove((n_fft - hop_length) / 
2), + ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT + ifft_out(n_fft * 2 * 4, 0.0f) { + cache.fill_sin_cos_table(n_fft); + cache.fill_hann_window(n_fft, true); +} + +void mtmd_audio_streaming_istft::reset() { + std::fill(overlap_buffer.begin(), overlap_buffer.end(), 0.0f); + std::fill(window_sum_buffer.begin(), window_sum_buffer.end(), 0.0f); + padding_to_remove = (n_fft - hop_length) / 2; +} + +std::vector mtmd_audio_streaming_istft::process_frame(const float * frame_spectrum) { + std::vector output(hop_length); + + // copy frequencies + for (int j = 0; j < n_fft_bins; j++) { + ifft_in[j * 2 + 0] = frame_spectrum[j * 2 + 0]; + ifft_in[j * 2 + 1] = frame_spectrum[j * 2 + 1]; + } + + // mirror negative frequencies + for (int j = 1; j < n_fft_bins - 1; j++) { + int mirror_idx = n_fft - j; + ifft_in[mirror_idx * 2 + 0] = ifft_in[j * 2 + 0]; + ifft_in[mirror_idx * 2 + 1] = -ifft_in[j * 2 + 1]; // conjugate + } + + ifft(cache, ifft_in.data(), n_fft, ifft_out.data()); + + // update window sum and overlap buffer + for (int j = 0; j < n_fft; j++) { + window_sum_buffer[j] += cache.hann_window[j] * cache.hann_window[j]; + overlap_buffer[j] += ifft_out[j * 2] * cache.hann_window[j]; + } + + // extract hop_length samples with normalization + for (int i = 0; i < hop_length; i++) { + if (window_sum_buffer[i] > 1e-8f) { + output[i] = overlap_buffer[i] / window_sum_buffer[i]; + } else { + output[i] = overlap_buffer[i]; + } + } + + // shift buffers left by hop_length + std::copy(overlap_buffer.begin() + hop_length, overlap_buffer.end(), overlap_buffer.begin()); + std::fill(overlap_buffer.end() - hop_length, overlap_buffer.end(), 0.0f); + + std::copy(window_sum_buffer.begin() + hop_length, window_sum_buffer.end(), window_sum_buffer.begin()); + std::fill(window_sum_buffer.end() - hop_length, window_sum_buffer.end(), 0.0f); + + // Remove padding if needed + int to_remove = std::min(padding_to_remove, (int) output.size()); + padding_to_remove -= to_remove; + output.erase(output.begin(), output.begin() + to_remove); + + return output; +} + +std::vector mtmd_audio_streaming_istft::flush() { + std::vector output; + + // Extract remaining samples from overlap buffer + // Continue until we've extracted all meaningful samples + int remaining = n_fft - hop_length; + while (remaining > 0) { + int chunk_size = std::min(remaining, hop_length); + + for (int i = 0; i < chunk_size; i++) { + float sample; + if (window_sum_buffer[i] > 1e-8f) { + sample = overlap_buffer[i] / window_sum_buffer[i]; + } else { + sample = overlap_buffer[i]; + } + output.push_back(sample); + } + + // Shift buffers + std::copy(overlap_buffer.begin() + chunk_size, overlap_buffer.end(), overlap_buffer.begin()); + std::fill(overlap_buffer.end() - chunk_size, overlap_buffer.end(), 0.0f); + + std::copy(window_sum_buffer.begin() + chunk_size, window_sum_buffer.end(), window_sum_buffer.begin()); + std::fill(window_sum_buffer.end() - chunk_size, window_sum_buffer.end(), 0.0f); + + remaining -= chunk_size; + } + + return output; +} diff --git a/llama.cpp/tools/mtmd/mtmd-audio.h b/llama.cpp/tools/mtmd/mtmd-audio.h new file mode 100644 index 0000000..016c739 --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-audio.h @@ -0,0 +1,113 @@ +#pragma once + +#include "ggml.h" +#include "clip-model.h" + +#include +#include +#include + +#define MTMD_INTERNAL_HEADER + +struct mtmd_audio_mel { + int n_len; + int n_len_org; + int n_mel; + + std::vector data; +}; + +struct mtmd_audio_mel_filters { + int32_t n_mel; + int32_t n_fft; + + std::vector data; +}; + 
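// The filterbank produced by fill_mel_filterbank_matrix() is stored flattened,
// row-major as [n_mel x n_fft_bins]. A minimal sketch of how one frame of
// power-spectrum bins is reduced to mel energies with that layout (the same
// indexing the spectrogram worker uses; the function name is illustrative
// only and not part of the API; requires <vector> and <cstddef>):
static std::vector<double> apply_mel_filterbank(const std::vector<float> & filters, // n_mel * n_fft_bins coefficients
                                                const std::vector<float> & power,   // n_fft_bins values, |FFT|^2 per bin
                                                int n_mel,
                                                int n_fft_bins) {
    std::vector<double> mel(n_mel, 0.0);
    for (int m = 0; m < n_mel; ++m) {
        double sum = 0.0;
        for (int k = 0; k < n_fft_bins; ++k) {
            // row m of the filterbank weights the k-th frequency bin
            sum += double(power[k]) * filters[size_t(m) * size_t(n_fft_bins) + size_t(k)];
        }
        mel[m] = sum;
    }
    return mel;
}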
+// cache for audio processing, each processor instance owns its own cache +struct mtmd_audio_cache { + std::vector sin_vals; + std::vector cos_vals; + + std::vector hann_window; + + mtmd_audio_mel_filters filters; + + void fill_sin_cos_table(int n); + + void fill_hann_window(int length, bool periodic); + + // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. + // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. + void fill_mel_filterbank_matrix(int n_mel, + int n_fft, + int sample_rate, // e.g. 16000 + float fmin = 0.0f, // e.g. 0.0 + float fmax = -1.0f, // e.g. sr/2; pass -1 for auto + bool slaney_area_norm = true, + float scale = 1.0f // optional extra scaling + ); +}; + +struct mtmd_audio_preprocessor { + const clip_hparams & hparams; + + mtmd_audio_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} + + virtual ~mtmd_audio_preprocessor() = default; + virtual void initialize() = 0; // NOT thread-safe + virtual bool preprocess(const float * samples, size_t n_samples, std::vector & output) = 0; +}; + +struct mtmd_audio_preprocessor_whisper : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_whisper(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; +}; + +struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_conformer(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; +}; + +// +// streaming ISTFT - converts spectrogram frames back to audio one frame at a time +// +struct mtmd_audio_streaming_istft { + mtmd_audio_streaming_istft(int n_fft, int hop_length); + + // reset streaming state + void reset(); + + // process a single STFT frame (streaming) + // frame_spectrum: [n_fft_bins x 2] interleaved real/imag + // returns: up to hop_length samples + std::vector process_frame(const float * frame_spectrum); + + // flush remaining samples at end of stream + std::vector flush(); + + private: + int n_fft; + int hop_length; + int n_fft_bins; + + // Own cache for output processing + mtmd_audio_cache cache; + + // Streaming state + std::vector overlap_buffer; + std::vector window_sum_buffer; + int padding_to_remove; + + // Working buffers for IFFT + std::vector ifft_in; + std::vector ifft_out; +}; diff --git a/llama.cpp/tools/mtmd/mtmd-cli.cpp b/llama.cpp/tools/mtmd/mtmd-cli.cpp new file mode 100644 index 0000000..054c7fa --- /dev/null +++ b/llama.cpp/tools/mtmd/mtmd-cli.cpp @@ -0,0 +1,437 @@ +#include "arg.h" +#include "debug.h" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "llama.h" +#include "ggml.h" +#include "console.h" +#include "chat.h" +#include "mtmd.h" +#include "mtmd-helper.h" + +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +// volatile, because of signal being an interrupt +static volatile bool g_is_generating = false; +static volatile bool g_is_interrupted = false; + +/** + * Please note that this is NOT a production-ready stuff. + * It is a playground for trying multimodal support in llama.cpp. 
+ * For contributors: please keep this code simple and easy to understand. + */ + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG( + "Experimental CLI for multimodal\n\n" + "Usage: %s [options] -m --mmproj --image --audio