diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..31d811a391ea684b51aed961d2312b308fc910da --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +BSD 2-Clause License + +Copyright (c) 2026, Mitja Felicijan + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2d18a8d987259d257429d61554445a2bf940876d --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ +# llm.vim + +Asynchronous Vim plugin for Ollama LLMs. + +## Features +- **Code Completion & Refactoring (`l`)**: Replaces or appends code using the File Skeleton and Local Context as prompts. +- **Conversational Questions (`k`)**: Opens a vertical right-split scratch buffer (`ft=markdown`) for queries. +- **Model Management (`:LLMModel`)**: Command with tab-completion for switching `g:llm_model` based on local Ollama tags. +- **Asynchronous Execution**: Uses Vim 8 jobs and temporary files for non-blocking I/O. +- **Indentation Aware**: Uses Vim's `=` operator on inserted code. + +## Installation +```vim +Plug 'mitjafelicijan/llm.vim' +``` + +## Configuration +Override defaults in your `.vimrc`: + +Check for available models https://ollama.com/search. + +```vim +let g:llm_model = 'granite4.1:3b' +let g:llm_url = 'http://127.0.0.1:11434' +``` + +### Custom Mappings +To disable default mappings and define your own, set `g:llm_disable_mappings` and use the following functions: + +```vim +let g:llm_disable_mappings = 1 + +" Normal mode question +nnoremap :call llm#AskQuestion(0) +" Visual mode question +xnoremap :call llm#AskQuestion(1) +" Visual mode completion +xnoremap :call llm#RequestCompletion(1) +``` + +## Mappings & Commands (Defaults) +- `l` (Visual): Refactor or replace selection. +- `k` (Normal/Visual): Open `[LLM Answer]` buffer on the right. +- `:LLMModel`: Switch `g:llm_model`. Supports `` completion via Ollama API. diff --git a/autoload/llm.vim b/autoload/llm.vim new file mode 100644 index 0000000000000000000000000000000000000000..32fe966caf6956fcac296199c0d2a699378a41fd --- /dev/null +++ b/autoload/llm.vim @@ -0,0 +1,296 @@ +let g:llm_model = get(g:, 'llm_model', 'falcon3:latest') +let g:llm_url = get(g:, 'llm_url', 'http://127.0.0.1:11434') +let g:llm_scratch_bufnr = -1 + +" System Prompts +let g:llm_system_prompt_code = get(g:, 'llm_system_prompt_code', "Output ONLY raw code. No markdown. No commentary. Do not include comments. Keep logical units on a single line; do not split statements or function calls into multiple lines.") +let g:llm_system_prompt_question = get(g:, 'llm_system_prompt_question', "You are a helpful assistant. Clear and concise.") +let g:llm_visual_task_prompt = get(g:, 'llm_visual_task_prompt', "TASK: Replace the following selection. Output ONLY raw replacement code. No markdown formatting. No backticks. No commentary. Do not include comments. Constraint: Do not split statements or function calls into multiple lines; keep logical units on a single line.") + +function! llm#GetOrCreateScratch() + if g:llm_scratch_bufnr != -1 && bufexists(g:llm_scratch_bufnr) + let l:winid = bufwinid(g:llm_scratch_bufnr) + if l:winid != -1 + call win_gotoid(l:winid) + else + " execute 'topleft split' + execute 'vertical botright split' + execute 'buffer ' . g:llm_scratch_bufnr + endif + else + " silent execute 'topleft 20new' + silent execute 'vertical botright 80new' + setlocal buftype=nofile bufhidden=hide noswapfile wrap + let g:llm_scratch_bufnr = bufnr('%') + silent! execute 'file [LLM Answer]' + endif + + setlocal wrap filetype=markdown + return g:llm_scratch_bufnr +endfunction + +function! llm#GetStyleInstructions() + let l:indent_type = &expandtab ? "spaces" : "tabs" + let l:indent_width = &shiftwidth == 0 ? &tabstop : &shiftwidth + return printf("Style: Use %d %s for indentation. Match the project's naming convention.", l:indent_width, l:indent_type) +endfunction + +function! llm#GetSurroundingContext(line_start, line_end) + let l:top = getline(1, 20) + let l:local_start = max([1, a:line_start - 20]) + let l:local_end = max([1, a:line_start - 1]) + let l:local = getline(l:local_start, l:local_end) + + let l:ctx = "[File Skeleton]\n" . join(l:top, "\n") . "\n...\n" + let l:ctx .= "[Local Context]\n" . join(l:local, "\n") + return l:ctx +endfunction + +function! llm#HandleCodeExit(context, job, status) + let l:err_msg = "" + if filereadable(a:context.err_file) + let l:err_msg = join(readfile(a:context.err_file), " ") + endif + + if a:status != 0 + redraw | echoerr "LLM Request Failed (" . a:status . "): " . l:err_msg + call delete(a:context.res_file) + call delete(a:context.req_file) + call delete(a:context.err_file) + return + endif + + if !filereadable(a:context.res_file) + redraw | echoerr "LLM: No response file found" + call delete(a:context.req_file) + call delete(a:context.err_file) + return + endif + + let l:content = join(readfile(a:context.res_file, 'b'), "") + + " Clean up temp files + call delete(a:context.res_file) + call delete(a:context.req_file) + call delete(a:context.err_file) + + if empty(l:content) + redraw | echoerr "LLM: Empty response" + return + endif + + try + let l:json = json_decode(l:content) + if type(l:json) == v:t_dict && has_key(l:json, 'response') + let l:response = l:json['response'] + let l:response = substitute(l:response, '^```\w*\n', '', '') + let l:response = substitute(l:response, '\n```$', '', '') + let l:response = substitute(l:response, '^```', '', '') + let l:response = substitute(l:response, '```$', '', '') + let l:out_lines = split(l:response, "\n") + + if bufexists(a:context.bufnr) + let l:start_line = 0 + let l:end_line = 0 + + if a:context.is_replacing + call deletebufline(a:context.bufnr, a:context.range_start, a:context.range_end) + call appendbufline(a:context.bufnr, a:context.range_start - 1, l:out_lines) + let l:start_line = a:context.range_start + let l:end_line = a:context.range_start + len(l:out_lines) - 1 + else + call appendbufline(a:context.bufnr, a:context.insert_pos, l:out_lines) + let l:start_line = a:context.insert_pos + 1 + let l:end_line = a:context.insert_pos + len(l:out_lines) + endif + + " Apply auto-formatting + let l:winid = bufwinid(a:context.bufnr) + if l:winid != -1 && l:start_line <= l:end_line + call win_execute(l:winid, 'normal! ' . l:start_line . 'G=' . l:end_line . 'G') + endif + endif + redraw | echo "LLM: Code Updated" + elseif type(l:json) == v:t_dict && has_key(l:json, 'error') + redraw | echoerr "LLM Error: " . l:json.error + else + redraw | echoerr "LLM: Unexpected JSON structure" + endif + catch + redraw | echoerr "LLM: Failed to parse response. Content starts with: " . strpart(l:content, 0, 100) + endtry +endfunction + +function! llm#HandleQuestionExit(context, job, status) + let l:err_msg = "" + if filereadable(a:context.err_file) + let l:err_msg = join(readfile(a:context.err_file), " ") + endif + + if a:status != 0 + redraw | echoerr "LLM Request Failed (" . a:status . "): " . l:err_msg + call delete(a:context.res_file) + call delete(a:context.req_file) + call delete(a:context.err_file) + return + endif + + if !filereadable(a:context.res_file) + redraw | echoerr "LLM: No response file found" + call delete(a:context.req_file) + call delete(a:context.err_file) + return + endif + + let l:content = join(readfile(a:context.res_file, 'b'), "") + + " Clean up temp files + call delete(a:context.res_file) + call delete(a:context.req_file) + call delete(a:context.err_file) + + if !empty(l:content) + try + let l:json = json_decode(l:content) + if type(l:json) == v:t_dict && has_key(l:json, 'response') + if bufexists(g:llm_scratch_bufnr) + call appendbufline(g:llm_scratch_bufnr, '$', split(l:json['response'], "\n")) + let l:winid = bufwinid(g:llm_scratch_bufnr) + if l:winid != -1 + call win_execute(l:winid, 'normal! gg') + endif + endif + redraw | echo "LLM: Answer Received" + elseif type(l:json) == v:t_dict && has_key(l:json, 'error') + redraw | echoerr "LLM Error: " . l:json.error + endif + catch + redraw | echoerr "LLM: Failed to parse response" + endtry + endif +endfunction + +function! llm#StartJob(prompt, context, exit_cb) + redraw | echo "LLM: Thinking..." + let l:payload_dict = { + \ 'model': g:llm_model, + \ 'prompt': a:prompt, + \ 'stream': v:false + \ } + + let l:payload = json_encode(l:payload_dict) + let a:context.req_file = tempname() + let a:context.res_file = tempname() + let a:context.err_file = tempname() + + call writefile([l:payload], a:context.req_file, 'b') + + " Use native job_start redirection instead of shell pipes + " Added -f to curl to return non-zero exit code on HTTP errors + let l:cmd = ['curl', '-sfS', '-X', 'POST', g:llm_url . '/api/generate', + \ '--connect-timeout', '10', + \ '-d', '@' . a:context.req_file] + + let l:job = job_start(l:cmd, { + \ 'exit_cb': function(a:exit_cb, [a:context]), + \ 'out_io': 'file', + \ 'out_name': a:context.res_file, + \ 'err_io': 'file', + \ 'err_name': a:context.err_file + \ }) +endfunction + +function! llm#RequestCompletion(is_visual) + let l:context = {'mode': 'code', 'bufnr': bufnr('%')} + let l:file_info = "File: " . expand('%:t') . " (type: " . &filetype . ")" + let l:style = llm#GetStyleInstructions() + let l:system_prompt = g:llm_system_prompt_code . "\n" . l:style + + if a:is_visual + let l:context.range_start = getpos("'<")[1] + let l:context.range_end = getpos("'>")[1] + let l:selection = join(getline(l:context.range_start, l:context.range_end), "\n") + let l:instruction = input("Instruction: ") + redraw + + if empty(l:instruction) && empty(l:selection) | return | endif + + let l:mod_prompt = g:llm_visual_task_prompt . "\n" + if l:instruction =~? 'refactor' | let l:mod_prompt .= "Instruction: Refactor for efficiency/readability.\n" | endif + if l:instruction =~? 'fix\|bug' | let l:mod_prompt .= "Instruction: Fix bugs/errors.\n" | endif + if l:instruction =~? 'doc\|comment' | let l:mod_prompt .= "Instruction: Add documentation.\n" | endif + if !empty(l:instruction) | let l:mod_prompt .= "Additional Instruction: " . l:instruction . "\n" | endif + + let l:surround = llm#GetSurroundingContext(l:context.range_start, l:context.range_end) + let l:suffix = join(getline(l:context.range_end + 1, min([l:context.range_end + 100, line('$')])), "\n") + + let l:prompt = l:file_info . "\n" . l:style . "\n\n" + let l:prompt .= "[CONTEXT - FOR REFERENCE ONLY]\n" . l:surround . "\n" + if !empty(l:suffix) | let l:prompt .= "[CODE AFTER SELECTION - FOR REFERENCE ONLY]\n" . l:suffix . "\n" | endif + let l:prompt .= "\n" . l:mod_prompt + let l:prompt .= "\n[CODE TO REPLACE]\n" . l:selection . "\n\nREPLACEMENT CODE:" + + let l:context.is_replacing = 1 + endif + + call llm#StartJob(l:prompt, l:context, 'llm#HandleCodeExit') +endfunction + +function! llm#AskQuestion(is_visual) + let l:selection = "" + let l:line_start = line('.') + let l:line_end = line('.') + if a:is_visual + let l:line_start = getpos("'<")[1] + let l:line_end = getpos("'>")[1] + let l:selection = join(getline(l:line_start, l:line_end), "\n") + endif + + let l:question = input("Ask: ") + redraw + if empty(l:question) | return | endif + + let l:file_info = "File: " . expand('%:t') . " (type: " . &filetype . ")" + let l:surround = llm#GetSurroundingContext(l:line_start, l:line_end) + let l:prompt = l:file_info . "\n" . g:llm_system_prompt_question . "\n\n" . l:surround + if !empty(l:selection) | let l:prompt .= "\n\nContext Selection:\n" . l:selection | endif + let l:prompt .= "\n\nQuestion: " . l:question + + let l:bufnr = llm#GetOrCreateScratch() + silent call deletebufline(l:bufnr, 1, '$') + call setbufline(l:bufnr, 1, ["Question: " . l:question, "Model: " . g:llm_model, ""]) + let l:winid = bufwinid(l:bufnr) + if l:winid != -1 | call win_execute(l:winid, 'normal! gg') | endif + + call llm#StartJob(l:prompt, {'mode': 'question'}, 'llm#HandleQuestionExit') +endfunction + +function! llm#ListModels() + let l:cmd = 'curl -sS --connect-timeout 2 ' . g:llm_url . '/api/tags' + let l:output = system(l:cmd) + if v:shell_error != 0 + return [] + endif + try + let l:json = json_decode(l:output) + if type(l:json) == v:t_dict && has_key(l:json, 'models') + return map(l:json.models, 'v:val.name') + endif + catch + endtry + return [] +endfunction + +function! llm#CompleteModel(ArgLead, CmdLine, CursorPos) + let l:models = llm#ListModels() + return filter(l:models, 'v:val =~# "^" . a:ArgLead') +endfunction + +function! llm#SwitchModel(model) + if empty(a:model) + echo "LLM: Current model is " . g:llm_model + return + endif + let g:llm_model = a:model + echo "LLM: Model set to " . g:llm_model +endfunction diff --git a/doc/llm.txt b/doc/llm.txt new file mode 100644 index 0000000000000000000000000000000000000000..deee931fd12bfc05a421b1c4335d2840cccba29f --- /dev/null +++ b/doc/llm.txt @@ -0,0 +1,73 @@ +*llm.txt* Asynchronous LLM interaction for Vim + +============================================================================== +INTRODUCTION *llm-introduction* + +llm.vim is a plugin that allows you to interact with local Large Language +Models via Ollama. It supports code completion, refactoring, and general +questions, all handled asynchronously to keep Vim responsive. + +============================================================================== +MAPPINGS *llm-mappings* + +l (Visual Mode) + Uses the selected text as context and prompts for an + instruction (e.g., "refactor this"). The selection is + replaced with the LLM's response. + +k (Normal or Visual Mode) + Opens a vertical split on the right named "[LLM Answer]" + and prompts for a question. The buffer uses Markdown + formatting and is cleared for each new question. + +============================================================================== +COMMANDS *llm-commands* + +:LLMModel [model] *:LLMModel* + Switch the current model (|g:llm_model|). + Supports custom tab-completion for all locally + available Ollama models. If called without arguments, + it displays the current model. + +============================================================================== +CONFIGURATION *llm-config* + +The following variables can be set in your |.vimrc| to override defaults: + +g:llm_model *g:llm_model* + The Ollama model name to use. + Default: 'granite4.1:3b' + +g:llm_url *g:llm_url* + The base URL for the Ollama API. + Default: 'http://127.0.0.1:11434' + +g:llm_system_prompt_question *g:llm_system_prompt_question* + The system prompt used for general questions. + +g:llm_visual_task_prompt *g:llm_visual_task_prompt* + The prompt used for code replacement in visual mode. + +g:llm_disable_mappings *g:llm_disable_mappings* + If set to 1, the default mappings for l and k are + not defined. + Default: 0 + +============================================================================== +CUSTOM MAPPINGS *llm-custom-mappings* + +If you disable default mappings, you can define your own using the +following functions: + + llm#AskQuestion(is_visual) + llm#RequestCompletion(is_visual) + +Example: +> + let g:llm_disable_mappings = 1 + nnoremap :call llm#AskQuestion(0) + xnoremap :call llm#AskQuestion(1) + xnoremap :call llm#RequestCompletion(1) +< +============================================================================== +vim:tw=78:ts=8:ft=help:norl: diff --git a/main.go b/main.go new file mode 100644 index 0000000000000000000000000000000000000000..8748cb32225b7c05ed2e1f02ebe85f0b7133a71d --- /dev/null +++ b/main.go @@ -0,0 +1,93 @@ +package main + +import ( + "flag" + "fmt" + "io/fs" + "log" + "net/http" + "os" + "os/signal" + "syscall" + "time" +) + +const Version = "0.1.0" + +type loggingResponseWriter struct { + http.ResponseWriter + statusCode int +} + +func (lrw *loggingResponseWriter) WriteHeader(code int) { + lrw.statusCode = code + lrw.ResponseWriter.WriteHeader(code) +} + +func loggingMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + lrw := &loggingResponseWriter{w, http.StatusOK} + next.ServeHTTP(lrw, r) + log.Printf("%s %s %d %v", r.Method, r.URL.Path, lrw.statusCode, time.Since(start)) + }) +} + +var ( + GlobalConfig *Config +) + +func main() { + var showVersion bool + flag.StringVar(&ConfigPath, "c", "config.yaml", "custom config path") + flag.BoolVar(&showVersion, "v", false, "show version") + flag.Parse() + + if showVersion { + fmt.Printf("bbgit version %s\n", Version) + return + } + + var err error + GlobalConfig, err = loadConfig(ConfigPath) + if err != nil { + log.Fatalf("Error loading config: %v", err) + } + + LoadCache() + StartSaver() + + mux := http.NewServeMux() + + staticSub, _ := fs.Sub(staticFS, "static") + mux.Handle("GET /static/", http.StripPrefix("/static/", http.FileServer(http.FS(staticSub)))) + mux.HandleFunc("GET /{$}", homeHandler) + mux.HandleFunc("GET /r/{name}", repoHandler) + mux.HandleFunc("GET /r/{name}/readme", readmeHandler) + mux.HandleFunc("GET /r/{name}/license", licenseHandler) + mux.HandleFunc("GET /r/{name}/markers", markersHandler) + mux.HandleFunc("GET /r/{name}/commits.rss", repoCommitsRSSHandler) + mux.HandleFunc("GET /r/{name}/tags.rss", repoTagsRSSHandler) + mux.HandleFunc("GET /r/{name}/files/{path...}", filesHandler) + mux.HandleFunc("GET /r/{name}/blob/{path...}", blobHandler) + mux.HandleFunc("GET /r/{name}/raw/{path...}", rawHandler) + mux.HandleFunc("GET /r/{name}/archive/{path...}", archiveHandler) + mux.HandleFunc("GET /r/{name}/c/{hash}", commitHandler) + mux.HandleFunc("GET /r/{name}/c/{hash}/patch", patchHandler) + + go func() { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM) + <-sigChan + log.Println("Shutting down... saving cache.") + if err := SaveCache(); err != nil { + log.Printf("Error saving cache on shutdown: %v", err) + } + os.Exit(0) + }() + + fmt.Printf("Server starting on :8080 (config: %s)...\n", ConfigPath) + if err := http.ListenAndServe(":8080", loggingMiddleware(mux)); err != nil { + log.Fatalf("Error starting server: %v", err) + } +} diff --git a/ollama.md b/ollama.md new file mode 100644 index 0000000000000000000000000000000000000000..150479e6a079e892e786f2801bb8de5b50802100 --- /dev/null +++ b/ollama.md @@ -0,0 +1,1931 @@ +# API + +> Note: Ollama's API docs are moving to https://docs.ollama.com/api + +## Endpoints + +- [Generate a completion](#generate-a-completion) +- [Generate a chat completion](#generate-a-chat-completion) +- [Create a Model](#create-a-model) +- [List Local Models](#list-local-models) +- [Show Model Information](#show-model-information) +- [Copy a Model](#copy-a-model) +- [Delete a Model](#delete-a-model) +- [Pull a Model](#pull-a-model) +- [Push a Model](#push-a-model) +- [Generate Embeddings](#generate-embeddings) +- [List Running Models](#list-running-models) +- [Version](#version) +- [Experimental: Image Generation](#image-generation-experimental) + +## Conventions + +### Model names + +Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version. + +### Durations + +All durations are returned in nanoseconds. + +### Streaming responses + +Certain endpoints stream responses as JSON objects. Streaming can be disabled by providing `{"stream": false}` for these endpoints. + +## Generate a completion + +``` +POST /api/generate +``` + +Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request. + +### Parameters + +- `model`: (required) the [model name](#model-names) +- `prompt`: the prompt to generate a response for +- `suffix`: the text after the model response +- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`) +- `think`: (for thinking models) should the model think before responding? + +Advanced parameters (optional): + +- `format`: the format to return a response in. Format can be `json` or a JSON schema +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature` +- `system`: system message to (overrides what is defined in the `Modelfile`) +- `template`: the prompt template to use (overrides what is defined in the `Modelfile`) +- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects +- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) +- `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory + +Experimental image generation parameters (for image generation models only): + +> [!WARNING] +> These parameters are experimental and may change in future versions. + +- `width`: width of the generated image in pixels +- `height`: height of the generated image in pixels +- `steps`: number of diffusion steps + +#### Structured outputs + +Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below. + +#### JSON mode + +Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below. + +> [!IMPORTANT] +> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace. + +### Examples + +#### Generate request (Streaming) + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "llama3.2", + "prompt": "Why is the sky blue?" +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T08:52:19.385406455-07:00", + "response": "The", + "done": false +} +``` + +The final response in the stream also includes additional data about the generation: + +- `total_duration`: time spent generating the response +- `load_duration`: time spent in nanoseconds loading the model +- `prompt_eval_count`: number of tokens in the prompt +- `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt +- `eval_count`: number of tokens in the response +- `eval_duration`: time in nanoseconds spent generating the response +- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory +- `response`: empty if the response was streamed, if not streamed, this will contain the full response + +To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` \* `10^9`. + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T19:22:45.499127Z", + "response": "", + "done": true, + "context": [1, 2, 3], + "total_duration": 10706818083, + "load_duration": 6338219291, + "prompt_eval_count": 26, + "prompt_eval_duration": 130079000, + "eval_count": 259, + "eval_duration": 4232710000 +} +``` + +#### Request (No streaming) + +##### Request + +A response can be received in one reply when streaming is off. + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "llama3.2", + "prompt": "Why is the sky blue?", + "stream": false +}' +``` + +##### Response + +If `stream` is set to `false`, the response will be a single JSON object: + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T19:22:45.499127Z", + "response": "The sky is blue because it is the color of the sky.", + "done": true, + "context": [1, 2, 3], + "total_duration": 5043500667, + "load_duration": 5025959, + "prompt_eval_count": 26, + "prompt_eval_duration": 325953000, + "eval_count": 290, + "eval_duration": 4709213000 +} +``` + +#### Request (with suffix) + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "codellama:code", + "prompt": "def compute_gcd(a, b):", + "suffix": " return result", + "options": { + "temperature": 0 + }, + "stream": false +}' +``` + +##### Response + +```json5 +{ + "model": "codellama:code", + "created_at": "2024-07-22T20:47:51.147561Z", + "response": "\n if a == 0:\n return b\n else:\n return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n result = (a * b) / compute_gcd(a, b)\n", + "done": true, + "done_reason": "stop", + "context": [...], + "total_duration": 1162761250, + "load_duration": 6683708, + "prompt_eval_count": 17, + "prompt_eval_duration": 201222000, + "eval_count": 63, + "eval_duration": 953997000 +} +``` + +#### Request (Structured outputs) + +##### Request + +```shell +curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{ + "model": "llama3.1:8b", + "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON", + "stream": false, + "format": { + "type": "object", + "properties": { + "age": { + "type": "integer" + }, + "available": { + "type": "boolean" + } + }, + "required": [ + "age", + "available" + ] + } +}' +``` + +##### Response + +```json +{ + "model": "llama3.1:8b", + "created_at": "2024-12-06T00:48:09.983619Z", + "response": "{\n \"age\": 22,\n \"available\": true\n}", + "done": true, + "done_reason": "stop", + "context": [1, 2, 3], + "total_duration": 1075509083, + "load_duration": 567678166, + "prompt_eval_count": 28, + "prompt_eval_duration": 236000000, + "eval_count": 16, + "eval_duration": 269000000 +} +``` + +#### Request (JSON mode) + +> [!IMPORTANT] +> When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON. + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "llama3.2", + "prompt": "What color is the sky at different times of the day? Respond using JSON", + "format": "json", + "stream": false +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2023-11-09T21:07:55.186497Z", + "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n", + "done": true, + "context": [1, 2, 3], + "total_duration": 4648158584, + "load_duration": 4071084, + "prompt_eval_count": 36, + "prompt_eval_duration": 439038000, + "eval_count": 180, + "eval_duration": 4196918000 +} +``` + +The value of `response` will be a string containing JSON similar to: + +```json +{ + "morning": { + "color": "blue" + }, + "noon": { + "color": "blue-gray" + }, + "afternoon": { + "color": "warm gray" + }, + "evening": { + "color": "orange" + } +} +``` + +#### Request (with images) + +To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`: + +#### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "llava", + "prompt":"What is in this picture?", + "stream": false, + "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"] +}' +``` + +#### Response + +```json +{ + "model": "llava", + "created_at": "2023-11-03T15:36:02.583064Z", + "response": "A happy cartoon character, which is cute and cheerful.", + "done": true, + "context": [1, 2, 3], + "total_duration": 2938432250, + "load_duration": 2559292, + "prompt_eval_count": 1, + "prompt_eval_duration": 2195557000, + "eval_count": 44, + "eval_duration": 736432000 +} +``` + +#### Request (Raw Mode) + +In some cases, you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable templating. Also note that raw mode will not return a context. + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "mistral", + "prompt": "[INST] why is the sky blue? [/INST]", + "raw": true, + "stream": false +}' +``` + +#### Request (Reproducible outputs) + +For reproducible outputs, set `seed` to a number: + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "mistral", + "prompt": "Why is the sky blue?", + "options": { + "seed": 123 + } +}' +``` + +##### Response + +```json +{ + "model": "mistral", + "created_at": "2023-11-03T15:36:02.583064Z", + "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.", + "done": true, + "total_duration": 8493852375, + "load_duration": 6589624375, + "prompt_eval_count": 14, + "prompt_eval_duration": 119039000, + "eval_count": 110, + "eval_duration": 1779061000 +} +``` + +#### Generate request (With options) + +If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override. + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "llama3.2", + "prompt": "Why is the sky blue?", + "stream": false, + "options": { + "num_keep": 5, + "seed": 42, + "num_predict": 100, + "top_k": 20, + "top_p": 0.9, + "min_p": 0.0, + "typical_p": 0.7, + "repeat_last_n": 33, + "temperature": 0.8, + "repeat_penalty": 1.2, + "presence_penalty": 1.5, + "frequency_penalty": 1.0, + "penalize_newline": true, + "stop": ["\n", "user:"], + "numa": false, + "num_ctx": 1024, + "num_batch": 2, + "num_gpu": 1, + "main_gpu": 0, + "use_mmap": true, + "num_thread": 8 + } +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T19:22:45.499127Z", + "response": "The sky is blue because it is the color of the sky.", + "done": true, + "context": [1, 2, 3], + "total_duration": 4935886791, + "load_duration": 534986708, + "prompt_eval_count": 26, + "prompt_eval_duration": 107345000, + "eval_count": 237, + "eval_duration": 4289432000 +} +``` + +#### Load a model + +If an empty prompt is provided, the model will be loaded into memory. + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "llama3.2" +}' +``` + +##### Response + +A single JSON object is returned: + +```json +{ + "model": "llama3.2", + "created_at": "2023-12-18T19:52:07.071755Z", + "response": "", + "done": true +} +``` + +#### Unload a model + +If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory. + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "llama3.2", + "keep_alive": 0 +}' +``` + +##### Response + +A single JSON object is returned: + +```json +{ + "model": "llama3.2", + "created_at": "2024-09-12T03:54:03.516566Z", + "response": "", + "done": true, + "done_reason": "unload" +} +``` + +## Generate a chat completion + +``` +POST /api/chat +``` + +Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. Streaming can be disabled using `"stream": false`. The final response object will include statistics and additional data from the request. + +### Parameters + +- `model`: (required) the [model name](#model-names) +- `messages`: the messages of the chat, this can be used to keep a chat memory +- `tools`: list of tools in JSON for the model to use if supported +- `think`: (for thinking models) should the model think before responding? + +The `message` object has the following fields: + +- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool` +- `content`: the content of the message +- `thinking`: (for thinking models) the model's thinking process +- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`) +- `tool_calls` (optional): a list of tools in JSON that the model wants to use +- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result + +Advanced parameters (optional): + +- `format`: the format to return a response in. Format can be `json` or a JSON schema. +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature` +- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Tool calling + +Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below. + +Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below. + +[See models with tool calling capabilities](https://ollama.com/search?c=tool). + +### Structured outputs + +Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below. + +### Examples + +#### Chat request (Streaming) + +##### Request + +Send a chat message with a streaming response. + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "why is the sky blue?" + } + ] +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T08:52:19.385406455-07:00", + "message": { + "role": "assistant", + "content": "The", + "images": null + }, + "done": false +} +``` + +Final response: + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T19:22:45.499127Z", + "message": { + "role": "assistant", + "content": "" + }, + "done": true, + "total_duration": 4883583458, + "load_duration": 1334875, + "prompt_eval_count": 26, + "prompt_eval_duration": 342546000, + "eval_count": 282, + "eval_duration": 4535599000 +} +``` + +#### Chat request (Streaming with tools) + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "what is the weather in tokyo?" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the weather for" + } + }, + "required": ["city"] + } + } + } + ], + "stream": true +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```json +{ + "model": "llama3.2", + "created_at": "2025-07-07T20:22:19.184789Z", + "message": { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": { + "city": "Tokyo" + } + } + } + ] + }, + "done": false +} +``` + +Final response: + +```json +{ + "model": "llama3.2", + "created_at": "2025-07-07T20:22:19.19314Z", + "message": { + "role": "assistant", + "content": "" + }, + "done_reason": "stop", + "done": true, + "total_duration": 182242375, + "load_duration": 41295167, + "prompt_eval_count": 169, + "prompt_eval_duration": 24573166, + "eval_count": 15, + "eval_duration": 115959084 +} +``` + +#### Chat request (No streaming) + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "why is the sky blue?" + } + ], + "stream": false +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2023-12-12T14:13:43.416799Z", + "message": { + "role": "assistant", + "content": "Hello! How are you today?" + }, + "done": true, + "total_duration": 5191566416, + "load_duration": 2154458, + "prompt_eval_count": 26, + "prompt_eval_duration": 383809000, + "eval_count": 298, + "eval_duration": 4799921000 +} +``` + +#### Chat request (No streaming, with tools) + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "what is the weather in tokyo?" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the weather for" + } + }, + "required": ["city"] + } + } + } + ], + "stream": false +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2025-07-07T20:32:53.844124Z", + "message": { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": { + "city": "Tokyo" + } + } + } + ] + }, + "done_reason": "stop", + "done": true, + "total_duration": 3244883583, + "load_duration": 2969184542, + "prompt_eval_count": 169, + "prompt_eval_duration": 141656333, + "eval_count": 18, + "eval_duration": 133293625 +} +``` + +#### Chat request (Structured outputs) + +##### Request + +```shell +curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{ + "model": "llama3.1", + "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}], + "stream": false, + "format": { + "type": "object", + "properties": { + "age": { + "type": "integer" + }, + "available": { + "type": "boolean" + } + }, + "required": [ + "age", + "available" + ] + }, + "options": { + "temperature": 0 + } +}' +``` + +##### Response + +```json +{ + "model": "llama3.1", + "created_at": "2024-12-06T00:46:58.265747Z", + "message": { + "role": "assistant", + "content": "{\"age\": 22, \"available\": false}" + }, + "done_reason": "stop", + "done": true, + "total_duration": 2254970291, + "load_duration": 574751416, + "prompt_eval_count": 34, + "prompt_eval_duration": 1502000000, + "eval_count": 12, + "eval_duration": 175000000 +} +``` + +#### Chat request (With History) + +Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting. + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "why is the sky blue?" + }, + { + "role": "assistant", + "content": "due to rayleigh scattering." + }, + { + "role": "user", + "content": "how is that different than mie scattering?" + } + ] +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T08:52:19.385406455-07:00", + "message": { + "role": "assistant", + "content": "The" + }, + "done": false +} +``` + +Final response: + +```json +{ + "model": "llama3.2", + "created_at": "2023-08-04T19:22:45.499127Z", + "done": true, + "total_duration": 8113331500, + "load_duration": 6396458, + "prompt_eval_count": 61, + "prompt_eval_duration": 398801000, + "eval_count": 468, + "eval_duration": 7701267000 +} +``` + +#### Chat request (With history, with tools) + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "what is the weather in Toronto?" + }, + // the message from the model appended to history + { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": { + "city": "Toronto" + } + } + } + ] + }, + // the tool call result appended to history + { + "role": "tool", + "content": "11 degrees celsius", + "tool_name": "get_weather" + } + ], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The city to get the weather for" + } + }, + "required": ["city"] + } + } + } + ] +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2025-07-07T20:43:37.688511Z", + "message": { + "role": "assistant", + "content": "The current temperature in Toronto is 11°C." + }, + "done_reason": "stop", + "done": true, + "total_duration": 890771750, + "load_duration": 707634750, + "prompt_eval_count": 94, + "prompt_eval_duration": 91703208, + "eval_count": 11, + "eval_duration": 90282125 +} +``` + +#### Chat request (with images) + +##### Request + +Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64. + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llava", + "messages": [ + { + "role": "user", + "content": "what is in this image?", + "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"] + } + ] +}' +``` + +##### Response + +```json +{ + "model": "llava", + "created_at": "2023-12-13T22:42:50.203334Z", + "message": { + "role": "assistant", + "content": " The image features a cute, little pig with an angry facial expression. It's wearing a heart on its shirt and is waving in the air. This scene appears to be part of a drawing or sketching project.", + "images": null + }, + "done": true, + "total_duration": 1668506709, + "load_duration": 1986209, + "prompt_eval_count": 26, + "prompt_eval_duration": 359682000, + "eval_count": 83, + "eval_duration": 1303285000 +} +``` + +#### Chat request (Reproducible outputs) + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "Hello!" + } + ], + "options": { + "seed": 101, + "temperature": 0 + } +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2023-12-12T14:13:43.416799Z", + "message": { + "role": "assistant", + "content": "Hello! How are you today?" + }, + "done": true, + "total_duration": 5191566416, + "load_duration": 2154458, + "prompt_eval_count": 26, + "prompt_eval_duration": 383809000, + "eval_count": 298, + "eval_duration": 4799921000 +} +``` + +#### Chat request (with tools) + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [ + { + "role": "user", + "content": "What is the weather today in Paris?" + } + ], + "stream": false, + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get the weather for, e.g. San Francisco, CA" + }, + "format": { + "type": "string", + "description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location", "format"] + } + } + } + ] +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2024-07-22T20:33:28.123648Z", + "message": { + "role": "assistant", + "content": "", + "tool_calls": [ + { + "function": { + "name": "get_current_weather", + "arguments": { + "format": "celsius", + "location": "Paris, FR" + } + } + } + ] + }, + "done_reason": "stop", + "done": true, + "total_duration": 885095291, + "load_duration": 3753500, + "prompt_eval_count": 122, + "prompt_eval_duration": 328493000, + "eval_count": 33, + "eval_duration": 552222000 +} +``` + +#### Load a model + +If the messages array is empty, the model will be loaded into memory. + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [] +}' +``` + +##### Response + +```json +{ + "model": "llama3.2", + "created_at": "2024-09-12T21:17:29.110811Z", + "message": { + "role": "assistant", + "content": "" + }, + "done_reason": "load", + "done": true +} +``` + +#### Unload a model + +If the messages array is empty and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory. + +##### Request + +```shell +curl http://localhost:11434/api/chat -d '{ + "model": "llama3.2", + "messages": [], + "keep_alive": 0 +}' +``` + +##### Response + +A single JSON object is returned: + +```json +{ + "model": "llama3.2", + "created_at": "2024-09-12T21:33:17.547535Z", + "message": { + "role": "assistant", + "content": "" + }, + "done_reason": "unload", + "done": true +} +``` + +## Create a Model + +``` +POST /api/create +``` + +Create a model from: + +- another model; +- a safetensors directory; or +- a GGUF file. + +If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#create-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field. + +### Parameters + +- `model`: name of the model to create +- `from`: (optional) name of an existing model to create the new model from +- `files`: (optional) a dictionary of file names to SHA256 digests of blobs to create the model from +- `adapters`: (optional) a dictionary of file names to SHA256 digests of blobs for LORA adapters +- `template`: (optional) the prompt template for the model +- `license`: (optional) a string or list of strings containing the license or licenses for the model +- `system`: (optional) a string containing the system prompt for the model +- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.mdx#valid-parameters-and-values) for a list of parameters) +- `messages`: (optional) a list of message objects used to create a conversation +- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects +- `quantize` (optional): quantize a non-quantized (e.g. float16) model + +#### Quantization types + +| Type | Recommended | +| ------ | :---------: | +| q4_K_M | \* | +| q4_K_S | | +| q8_0 | \* | + +### Examples + +#### Create a new model + +Create a new model from an existing model. + +##### Request + +```shell +curl http://localhost:11434/api/create -d '{ + "model": "mario", + "from": "llama3.2", + "system": "You are Mario from Super Mario Bros." +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```json +{"status":"reading model metadata"} +{"status":"creating system layer"} +{"status":"using already created layer sha256:22f7f8ef5f4c791c1b03d7eb414399294764d7cc82c7e94aa81a1feb80a983a2"} +{"status":"using already created layer sha256:8c17c2ebb0ea011be9981cc3922db8ca8fa61e828c5d3f44cb6ae342bf80460b"} +{"status":"using already created layer sha256:7c23fb36d80141c4ab8cdbb61ee4790102ebd2bf7aeff414453177d4f2110e5d"} +{"status":"using already created layer sha256:2e0493f67d0c8c9c68a8aeacdf6a38a2151cb3c4c1d42accf296e19810527988"} +{"status":"using already created layer sha256:2759286baa875dc22de5394b4a925701b1896a7e3f8e53275c36f75a877a82c9"} +{"status":"writing layer sha256:df30045fe90f0d750db82a058109cecd6d4de9c90a3d75b19c09e5f64580bb42"} +{"status":"writing layer sha256:f18a68eb09bf925bb1b669490407c1b1251c5db98dc4d3d81f3088498ea55690"} +{"status":"writing manifest"} +{"status":"success"} +``` + +#### Quantize a model + +Quantize a non-quantized model. + +##### Request + +```shell +curl http://localhost:11434/api/create -d '{ + "model": "llama3.2:quantized", + "from": "llama3.2:3b-instruct-fp16", + "quantize": "q4_K_M" +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```json +{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302} +{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552} +{"status":"verifying conversion"} +{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"} +{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"} +{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"} +{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"} +{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"} +{"status":"writing manifest"} +{"status":"success"} +``` + +#### Create a model from GGUF + +Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API. + +##### Request + +```shell +curl http://localhost:11434/api/create -d '{ + "model": "my-gguf-model", + "files": { + "test.gguf": "sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3" + } +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```json +{"status":"parsing GGUF"} +{"status":"using existing layer sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3"} +{"status":"writing manifest"} +{"status":"success"} +``` + +#### Create a model from a Safetensors directory + +The `files` parameter should include a dictionary of files for the safetensors model which includes the file names and SHA256 digest of each file. Use [/api/blobs/:digest](#push-a-blob) to first push each of the files to the server before calling this API. Files will remain in the cache until the Ollama server is restarted. + +##### Request + +```shell +curl http://localhost:11434/api/create -d '{ + "model": "fred", + "files": { + "config.json": "sha256:dd3443e529fb2290423a0c65c2d633e67b419d273f170259e27297219828e389", + "generation_config.json": "sha256:88effbb63300dbbc7390143fbbdd9d9fa50587b37e8bfd16c8c90d4970a74a36", + "special_tokens_map.json": "sha256:b7455f0e8f00539108837bfa586c4fbf424e31f8717819a6798be74bef813d05", + "tokenizer.json": "sha256:bbc1904d35169c542dffbe1f7589a5994ec7426d9e5b609d07bab876f32e97ab", + "tokenizer_config.json": "sha256:24e8a6dc2547164b7002e3125f10b415105644fcf02bf9ad8b674c87b1eaaed6", + "model.safetensors": "sha256:1ff795ff6a07e6a68085d206fb84417da2f083f68391c2843cd2b8ac6df8538f" + } +}' +``` + +##### Response + +A stream of JSON objects is returned: + +```shell +{"status":"converting model"} +{"status":"creating new layer sha256:05ca5b813af4a53d2c2922933936e398958855c44ee534858fcfd830940618b6"} +{"status":"using autodetected template llama3-instruct"} +{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"} +{"status":"writing manifest"} +{"status":"success"} +``` + +## Check if a Blob Exists + +```shell +HEAD /api/blobs/:digest +``` + +Ensures that the file blob (Binary Large Object) used with create a model exists on the server. This checks your Ollama server and not ollama.com. + +### Query Parameters + +- `digest`: the SHA256 digest of the blob + +### Examples + +#### Request + +```shell +curl -I http://localhost:11434/api/blobs/sha256:29fdb92e57cf0827ded04ae6461b5931d01fa595843f55d36f5b275a52087dd2 +``` + +#### Response + +Return 200 OK if the blob exists, 404 Not Found if it does not. + +## Push a Blob + +``` +POST /api/blobs/:digest +``` + +Push a file to the Ollama server to create a "blob" (Binary Large Object). + +### Query Parameters + +- `digest`: the expected SHA256 digest of the file + +### Examples + +#### Request + +```shell +curl -T model.gguf -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf0827ded04ae6461b5931d01fa595843f55d36f5b275a52087dd2 +``` + +#### Response + +Return 201 Created if the blob was successfully created, 400 Bad Request if the digest used is not expected. + +## List Local Models + +``` +GET /api/tags +``` + +List models that are available locally. + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/tags +``` + +#### Response + +A single JSON object will be returned. + +```json +{ + "models": [ + { + "name": "deepseek-r1:latest", + "model": "deepseek-r1:latest", + "modified_at": "2025-05-10T08:06:48.639712648-07:00", + "size": 4683075271, + "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163", + "details": { + "parent_model": "", + "format": "gguf", + "family": "qwen2", + "families": ["qwen2"], + "parameter_size": "7.6B", + "quantization_level": "Q4_K_M" + } + }, + { + "name": "llama3.2:latest", + "model": "llama3.2:latest", + "modified_at": "2025-05-04T17:37:44.706015396-07:00", + "size": 2019393189, + "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72", + "details": { + "parent_model": "", + "format": "gguf", + "family": "llama", + "families": ["llama"], + "parameter_size": "3.2B", + "quantization_level": "Q4_K_M" + } + } + ] +} +``` + +## Show Model Information + +``` +POST /api/show +``` + +Show information about a model including details, modelfile, template, parameters, license, system prompt. + +### Parameters + +- `model`: name of the model to show +- `verbose`: (optional) if set to `true`, returns full data for verbose response fields + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/show -d '{ + "model": "llava" +}' +``` + +#### Response + +```json5 +{ + modelfile: '# Modelfile generated by "ollama show"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE """{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: """\nPARAMETER num_ctx 4096\nPARAMETER stop "\u003c/s\u003e"\nPARAMETER stop "USER:"\nPARAMETER stop "ASSISTANT:"', + parameters: 'num_keep 24\nstop "<|start_header_id|>"\nstop "<|end_header_id|>"\nstop "<|eot_id|>"', + template: "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>", + details: { + parent_model: "", + format: "gguf", + family: "llama", + families: ["llama"], + parameter_size: "8.0B", + quantization_level: "Q4_0", + }, + model_info: { + "general.architecture": "llama", + "general.file_type": 2, + "general.parameter_count": 8030261248, + "general.quantization_version": 2, + "llama.attention.head_count": 32, + "llama.attention.head_count_kv": 8, + "llama.attention.layer_norm_rms_epsilon": 0.00001, + "llama.block_count": 32, + "llama.context_length": 8192, + "llama.embedding_length": 4096, + "llama.feed_forward_length": 14336, + "llama.rope.dimension_count": 128, + "llama.rope.freq_base": 500000, + "llama.vocab_size": 128256, + "tokenizer.ggml.bos_token_id": 128000, + "tokenizer.ggml.eos_token_id": 128009, + "tokenizer.ggml.merges": [], // populates if `verbose=true` + "tokenizer.ggml.model": "gpt2", + "tokenizer.ggml.pre": "llama-bpe", + "tokenizer.ggml.token_type": [], // populates if `verbose=true` + "tokenizer.ggml.tokens": [], // populates if `verbose=true` + }, + capabilities: ["completion", "vision"], +} +``` + +## Copy a Model + +``` +POST /api/copy +``` + +Copy a model. Creates a model with another name from an existing model. + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/copy -d '{ + "source": "llama3.2", + "destination": "llama3-backup" +}' +``` + +#### Response + +Returns a 200 OK if successful, or a 404 Not Found if the source model doesn't exist. + +## Delete a Model + +``` +DELETE /api/delete +``` + +Delete a model and its data. + +### Parameters + +- `model`: model name to delete + +### Examples + +#### Request + +```shell +curl -X DELETE http://localhost:11434/api/delete -d '{ + "model": "llama3:13b" +}' +``` + +#### Response + +Returns a 200 OK if successful, 404 Not Found if the model to be deleted doesn't exist. + +## Pull a Model + +``` +POST /api/pull +``` + +Download a model from the ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress. + +### Parameters + +- `model`: name of the model to pull +- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development. +- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/pull -d '{ + "model": "llama3.2" +}' +``` + +#### Response + +If `stream` is not specified, or set to `true`, a stream of JSON objects is returned: + +The first object is the manifest: + +```json +{ + "status": "pulling manifest" +} +``` + +Then there is a series of downloading responses. Until any of the download is completed, the `completed` key may not be included. The number of files to be downloaded depends on the number of layers specified in the manifest. + +```json +{ + "status": "pulling digestname", + "digest": "digestname", + "total": 2142590208, + "completed": 241970 +} +``` + +After all the files are downloaded, the final responses are: + +```json +{ + "status": "verifying sha256 digest" +} +{ + "status": "writing manifest" +} +{ + "status": "removing any unused layers" +} +{ + "status": "success" +} +``` + +if `stream` is set to false, then the response is a single JSON object: + +```json +{ + "status": "success" +} +``` + +## Push a Model + +``` +POST /api/push +``` + +Upload a model to a model library. Requires registering for ollama.ai and adding a public key first. + +### Parameters + +- `model`: name of the model to push in the form of `/:` +- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development. +- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/push -d '{ + "model": "mattw/pygmalion:latest" +}' +``` + +#### Response + +If `stream` is not specified, or set to `true`, a stream of JSON objects is returned: + +```json +{ "status": "retrieving manifest" } +``` + +and then: + +```json +{ + "status": "starting upload", + "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab", + "total": 1928429856 +} +``` + +Then there is a series of uploading responses: + +```json +{ + "status": "starting upload", + "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab", + "total": 1928429856 +} +``` + +Finally, when the upload is complete: + +```json +{"status":"pushing manifest"} +{"status":"success"} +``` + +If `stream` is set to `false`, then the response is a single JSON object: + +```json +{ "status": "success" } +``` + +## Generate Embeddings + +``` +POST /api/embed +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `input`: text or list of text to generate embeddings for + +Advanced parameters: + +- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true` +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) +- `dimensions`: number of dimensions for the embedding + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": "Why is the sky blue?" +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [ + [ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ] + ], + "total_duration": 14143917, + "load_duration": 1019500, + "prompt_eval_count": 8 +} +``` + +#### Request (Multiple input) + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": ["Why is the sky blue?", "Why is the grass green?"] +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [ + [ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ], + [ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ] + ] +} +``` + +## List Running Models + +``` +GET /api/ps +``` + +List models that are currently loaded into memory. + +#### Examples + +### Request + +```shell +curl http://localhost:11434/api/ps +``` + +#### Response + +A single JSON object will be returned. + +```json +{ + "models": [ + { + "name": "mistral:latest", + "model": "mistral:latest", + "size": 5137025024, + "digest": "2ae6f6dd7a3dd734790bbbf58b8909a606e0e7e97e94b7604e0aa7ae4490e6d8", + "details": { + "parent_model": "", + "format": "gguf", + "family": "llama", + "families": ["llama"], + "parameter_size": "7.2B", + "quantization_level": "Q4_0" + }, + "expires_at": "2024-06-04T14:38:31.83753-07:00", + "size_vram": 5137025024 + } + ] +} +``` + +## Generate Embedding + +> Note: this endpoint has been superseded by `/api/embed` + +``` +POST /api/embeddings +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `prompt`: text to generate embeddings for + +Advanced parameters: + +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embeddings -d '{ + "model": "all-minilm", + "prompt": "Here is an article about llamas..." +}' +``` + +#### Response + +```json +{ + "embedding": [ + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, + -0.2916173040866852, -0.8924556970596313, 0.8785552978515625, + -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, + -0.137906014919281 + ] +} +``` + +## Version + +``` +GET /api/version +``` + +Retrieve the Ollama version + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/version +``` + +#### Response + +```json +{ + "version": "0.5.1" +} +``` + +## Experimental Features + +### Image Generation (Experimental) + +> [!WARNING] +> Image generation is experimental and may change in future versions. + +Image generation is now supported through the standard `/api/generate` endpoint when using image generation models. The API automatically detects when an image generation model is being used. + +See the [Generate a completion](#generate-a-completion) section for the full API documentation. The experimental image generation parameters (`width`, `height`, `steps`) are documented there. + +#### Example + +##### Request + +```shell +curl http://localhost:11434/api/generate -d '{ + "model": "x/z-image-turbo", + "prompt": "a sunset over mountains", + "width": 1024, + "height": 768 +}' +``` + +##### Response (streaming) + +Progress updates during generation: + +```json +{ + "model": "x/z-image-turbo", + "created_at": "2024-01-15T10:30:00.000000Z", + "completed": 5, + "total": 20, + "done": false +} +``` + +##### Final Response + +```json +{ + "model": "x/z-image-turbo", + "created_at": "2024-01-15T10:30:15.000000Z", + "image": "iVBORw0KGgoAAAANSUhEUg...", + "done": true, + "done_reason": "stop", + "total_duration": 15000000000, + "load_duration": 2000000000 +} +``` diff --git a/plugin/llm.vim b/plugin/llm.vim new file mode 100644 index 0000000000000000000000000000000000000000..08108d51731c6f438b7e6f1ce3fe6836264084d9 --- /dev/null +++ b/plugin/llm.vim @@ -0,0 +1,12 @@ +if exists('g:loaded_llm') + finish +endif +let g:loaded_llm = 1 + +if !get(g:, 'llm_disable_mappings', 0) + xnoremap l :call llm#RequestCompletion(1) + nnoremap k :call llm#AskQuestion(0) + xnoremap k :call llm#AskQuestion(1) +endif + +command! -nargs=? -complete=customlist,llm#CompleteModel LLMModel call llm#SwitchModel()