diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..31d811a391ea684b51aed961d2312b308fc910da
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+BSD 2-Clause License
+
+Copyright (c) 2026, Mitja Felicijan <mitja.felicijan@gmail.com>
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d18a8d987259d257429d61554445a2bf940876d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# llm.vim
+
+Asynchronous Vim plugin for Ollama LLMs.
+
+## Features
+- **Code Completion & Refactoring (`<leader>l`)**: Replaces or appends code using the File Skeleton and Local Context as prompts.
+- **Conversational Questions (`<leader>k`)**: Opens a vertical right-split scratch buffer (`ft=markdown`) for queries.
+- **Model Management (`:LLMModel`)**: Command with tab-completion for switching `g:llm_model` based on local Ollama tags.
+- **Asynchronous Execution**: Uses Vim 8 jobs and temporary files for non-blocking I/O.
+- **Indentation Aware**: Uses Vim's `=` operator on inserted code.
+
+## Installation
+```vim
+Plug 'mitjafelicijan/llm.vim'
+```
+
+## Configuration
+Override defaults in your `.vimrc`:
+
+Browse the available models at https://ollama.com/search.
+
+```vim
+let g:llm_model = 'granite4.1:3b'
+let g:llm_url = 'http://127.0.0.1:11434'
+```
+
+### Custom Mappings
+To disable default mappings and define your own, set `g:llm_disable_mappings` and use the following functions:
+
+```vim
+let g:llm_disable_mappings = 1
+
+" Normal mode question
+nnoremap <C-k> :call llm#AskQuestion(0)<CR>
+" Visual mode question
+xnoremap <C-k> :<C-u>call llm#AskQuestion(1)<CR>
+" Visual mode completion
+xnoremap <C-l> :<C-u>call llm#RequestCompletion(1)<CR>
+```
+
+## Mappings & Commands (Defaults)
+- `<leader>l` (Visual): Refactor or replace selection.
+- `<leader>k` (Normal/Visual): Open `[LLM Answer]` buffer on the right.
+- `:LLMModel`: Switch `g:llm_model`. Supports `<Tab>` completion via Ollama API.
diff --git a/autoload/llm.vim b/autoload/llm.vim
new file mode 100644
index 0000000000000000000000000000000000000000..32fe966caf6956fcac296199c0d2a699378a41fd
--- /dev/null
+++ b/autoload/llm.vim
@@ -0,0 +1,296 @@
+let g:llm_model = get(g:, 'llm_model', 'falcon3:latest')
+let g:llm_url = get(g:, 'llm_url', 'http://127.0.0.1:11434')
+let g:llm_scratch_bufnr = -1
+" g:llm_scratch_bufnr is the number of the [LLM Answer] buffer; it stays -1 until llm#GetOrCreateScratch() creates one.
+" Prompt templates. Like the model and URL above, each keeps a value the user has already set and otherwise takes the default given here.
+let g:llm_system_prompt_code = get(g:, 'llm_system_prompt_code', "Output ONLY raw code. No markdown. No commentary. Do not include comments. Keep logical units on a single line; do not split statements or function calls into multiple lines.")
+let g:llm_system_prompt_question = get(g:, 'llm_system_prompt_question', "You are a helpful assistant. Clear and concise.")
+let g:llm_visual_task_prompt = get(g:, 'llm_visual_task_prompt', "TASK: Replace the following selection. Output ONLY raw replacement code. No markdown formatting. No backticks. No commentary. Do not include comments. Constraint: Do not split statements or function calls into multiple lines; keep logical units on a single line.")
+
+" Focus the [LLM Answer] scratch window, creating the buffer and/or a right-hand vertical split when needed.
+" Returns the scratch buffer number, which is also kept in g:llm_scratch_bufnr.
+function! llm#GetOrCreateScratch()
+	let l:have_buf = g:llm_scratch_bufnr != -1 && bufexists(g:llm_scratch_bufnr)
+	let l:win = l:have_buf ? bufwinid(g:llm_scratch_bufnr) : -1
+	if l:win != -1
+		call win_gotoid(l:win)
+	elseif l:have_buf
+		" The buffer exists but has no window: show it again in a new split.
+		vertical botright split
+		execute 'buffer ' . g:llm_scratch_bufnr
+	else
+		" No usable scratch buffer yet: open an 80-column one and remember its number.
+		silent vertical botright 80new
+		setlocal buftype=nofile bufhidden=hide noswapfile wrap
+		let g:llm_scratch_bufnr = bufnr('%')
+		silent! file [LLM Answer]
+	endif
+	setlocal wrap filetype=markdown
+	return g:llm_scratch_bufnr
+endfunction
+
+" Describe the current buffer's indentation ('expandtab', and 'shiftwidth' falling back to 'tabstop' when it is 0) as a style hint for the prompt.
+function! llm#GetStyleInstructions()
+	let l:width = &shiftwidth ? &shiftwidth : &tabstop
+	return printf("Style: Use %d %s for indentation. Match the project's naming convention.", l:width, &expandtab ? "spaces" : "tabs")
+endfunction
+
+" Build the reference context for a prompt: the first 20 lines of the buffer ("File Skeleton") plus up to 20 lines directly above a:line_start ("Local Context").
+" a:line_end is currently unused.
+function! llm#GetSurroundingContext(line_start, line_end)
+	let l:top = getline(1, 20)
+	" Only lines strictly above the range are local context; a range starting on line 1 has none (line 1 itself used to be repeated here).
+	let l:local = a:line_start > 1 ? getline(max([1, a:line_start - 20]), a:line_start - 1) : []
+	let l:ctx = "[File Skeleton]\n" . join(l:top, "\n") . "\n...\n"
+	let l:ctx .= "[Local Context]\n" . join(l:local, "\n")
+	return l:ctx
+endfunction
+
+" Exit callback for code jobs: collect curl's output, remove the temp files and splice the model's answer into the source buffer.
+"   a:context - dict with req_file/res_file/err_file, bufnr, is_replacing and either range_start/range_end (replace) or insert_pos (append)
+"   a:job     - the finished job (unused)
+"   a:status  - curl's exit status; anything but 0 is reported as a failed request
+function! llm#HandleCodeExit(context, job, status)
+	let l:err_msg = filereadable(a:context.err_file) ? join(readfile(a:context.err_file), " ") : ""
+	let l:have_response = filereadable(a:context.res_file)
+	let l:content = (a:status == 0 && l:have_response) ? join(readfile(a:context.res_file, 'b'), "") : ""
+
+	" Everything needed has been read, so the temp files can go on every path.
+	for l:tmp in [a:context.res_file, a:context.req_file, a:context.err_file] | call delete(l:tmp) | endfor
+
+	if a:status != 0
+		redraw | echoerr "LLM Request Failed (" . a:status . "): " . l:err_msg
+		return
+	elseif !l:have_response
+		redraw | echoerr "LLM: No response file found"
+		return
+	elseif empty(l:content)
+		redraw | echoerr "LLM: Empty response"
+		return
+	endif
+
+	" Only the decode runs inside :try. :echoerr within a :try block is thrown as an exception, so reporting from
+	" inside it let the bare :catch replace the server's own error text with the parse-failure message.
+	let l:decoded = 1
+	try
+		let l:json = json_decode(l:content)
+	catch
+		let l:decoded = 0
+	endtry
+	if !l:decoded
+		redraw | echoerr "LLM: Failed to parse response. Content starts with: " . strpart(l:content, 0, 100)
+		return
+	elseif type(l:json) == v:t_dict && !has_key(l:json, 'response') && has_key(l:json, 'error')
+		" The server answered with an error object instead of a response.
+		redraw | echoerr "LLM Error: " . (type(l:json.error) == v:t_string ? l:json.error : string(l:json.error))
+		return
+	elseif type(l:json) != v:t_dict || type(get(l:json, 'response', 0)) != v:t_string
+		redraw | echoerr "LLM: Unexpected JSON structure"
+		return
+	endif
+
+	" Drop a surrounding markdown fence; whitespace or a newline after the closing fence is tolerated.
+	let l:response = l:json['response']
+	let l:response = substitute(l:response, '^```\w*\n', '', '')
+	let l:response = substitute(l:response, '\n```\_s*$', '', '')
+	let l:response = substitute(l:response, '^```', '', '')
+	let l:response = substitute(l:response, '```\_s*$', '', '')
+	let l:out_lines = split(l:response, "\n")
+	if empty(l:out_lines) || !bufexists(a:context.bufnr)
+		" Nothing to insert, or nowhere to insert it: leave the selection alone rather than deleting it.
+		redraw | echoerr empty(l:out_lines) ? "LLM: Empty response" : "LLM: Target buffer no longer exists"
+		return
+	endif
+
+	" Replace the captured line range, or append after insert_pos when not replacing.
+	if get(a:context, 'is_replacing', 0)
+		call deletebufline(a:context.bufnr, a:context.range_start, a:context.range_end)
+		call appendbufline(a:context.bufnr, a:context.range_start - 1, l:out_lines)
+		let l:start_line = a:context.range_start
+	else
+		call appendbufline(a:context.bufnr, a:context.insert_pos, l:out_lines)
+		let l:start_line = a:context.insert_pos + 1
+	endif
+	let l:end_line = l:start_line + len(l:out_lines) - 1
+
+	" Re-indent the inserted lines with the = operator when the buffer is shown in a window.
+	let l:winid = bufwinid(a:context.bufnr)
+	if l:winid != -1
+		call win_execute(l:winid, 'normal! ' . l:start_line . 'G=' . l:end_line . 'G')
+	endif
+	redraw | echo "LLM: Code Updated"
+endfunction
+
+" Exit callback for question jobs: collect curl's output, remove the temp files and append the answer to the [LLM Answer] buffer.
+"   a:context - dict holding the req_file/res_file/err_file paths
+"   a:job     - the finished job (unused)
+"   a:status  - curl's exit status; anything but 0 is reported as a failed request
+function! llm#HandleQuestionExit(context, job, status)
+	let l:err_msg = filereadable(a:context.err_file) ? join(readfile(a:context.err_file), " ") : ""
+	let l:have_response = filereadable(a:context.res_file)
+	let l:content = (a:status == 0 && l:have_response) ? join(readfile(a:context.res_file, 'b'), "") : ""
+
+	" Everything needed has been read, so the temp files can go on every path.
+	for l:tmp in [a:context.res_file, a:context.req_file, a:context.err_file] | call delete(l:tmp) | endfor
+
+	if a:status != 0
+		redraw | echoerr "LLM Request Failed (" . a:status . "): " . l:err_msg
+		return
+	elseif !l:have_response
+		redraw | echoerr "LLM: No response file found"
+		return
+	elseif empty(l:content)
+		redraw | echoerr "LLM: Empty response"
+		return
+	endif
+
+	" Only the decode runs inside :try; :echoerr inside :try is thrown as an exception and the bare :catch used to mask the real message.
+	let l:decoded = 1
+	try
+		let l:json = json_decode(l:content)
+	catch
+		let l:decoded = 0
+	endtry
+	if !l:decoded
+		redraw | echoerr "LLM: Failed to parse response"
+		return
+	elseif type(l:json) == v:t_dict && !has_key(l:json, 'response') && has_key(l:json, 'error')
+		redraw | echoerr "LLM Error: " . (type(l:json.error) == v:t_string ? l:json.error : string(l:json.error))
+		return
+	elseif type(l:json) != v:t_dict || type(get(l:json, 'response', 0)) != v:t_string
+		redraw | echoerr "LLM: Unexpected JSON structure"
+		return
+	endif
+
+	if bufexists(g:llm_scratch_bufnr)
+		call appendbufline(g:llm_scratch_bufnr, '$', split(l:json['response'], "\n"))
+		let l:winid = bufwinid(g:llm_scratch_bufnr)
+		if l:winid != -1 | call win_execute(l:winid, 'normal! gg') | endif
+	endif
+	redraw | echo "LLM: Answer Received"
+endfunction
+
+" Send a:prompt to Ollama's /api/generate endpoint with curl, run as a background job.
+"   a:prompt  - full prompt text
+"   a:context - dict; gains req_file/res_file/err_file (temp file paths) and is bound as the callback's first argument
+"   a:exit_cb - name of the function invoked as exit_cb(context, job, status) when curl exits
+function! llm#StartJob(prompt, context, exit_cb)
+	redraw | echo "LLM: Thinking..."
+	let l:payload = json_encode({'model': g:llm_model, 'prompt': a:prompt, 'stream': v:false})
+	let a:context.req_file = tempname()
+	let a:context.res_file = tempname()
+	let a:context.err_file = tempname()
+	call writefile([l:payload], a:context.req_file, 'b')
+
+	" -f makes curl exit non-zero on HTTP errors; stdout and stderr are redirected to files by job_start() itself, not by a shell.
+	let l:cmd = ['curl', '-sfS', '-X', 'POST', g:llm_url . '/api/generate',
+				\ '--connect-timeout', '10',
+				\ '-d', '@' . a:context.req_file]
+	let l:job = job_start(l:cmd, {
+				\ 'exit_cb': function(a:exit_cb, [a:context]),
+				\ 'out_io': 'file',
+				\ 'out_name': a:context.res_file,
+				\ 'err_io': 'file',
+				\ 'err_name': a:context.err_file
+				\ })
+	" A job that failed to start never invokes exit_cb, so remove the temp files and report the failure here.
+	if job_status(l:job) ==# 'fail'
+		for l:tmp in [a:context.req_file, a:context.res_file, a:context.err_file] | call delete(l:tmp) | endfor
+		redraw | echoerr "LLM: Failed to start curl job"
+	endif
+endfunction
+
+" Request code from the model: with a:is_visual true the '<,'> line range is replaced (guided by an optional typed instruction); otherwise the answer is appended below the cursor line.
+function! llm#RequestCompletion(is_visual)
+	let l:context = {'mode': 'code', 'bufnr': bufnr('%')}
+	let l:file_info = "File: " . expand('%:t') . " (type: " . &filetype . ")"
+	let l:style = llm#GetStyleInstructions()
+
+	if !a:is_visual
+		" This path used to reach llm#StartJob() with l:prompt undefined (E121) and without is_replacing/insert_pos for the exit callback.
+		call extend(l:context, {'is_replacing': 0, 'insert_pos': line('.')})
+		let l:prompt = l:file_info . "\n" . g:llm_system_prompt_code . "\n" . l:style . "\n\n"
+		let l:prompt .= llm#GetSurroundingContext(line('.') + 1, line('.')) . "\n\nCONTINUE THE CODE:"
+		call llm#StartJob(l:prompt, l:context, 'llm#HandleCodeExit')
+		return
+	endif
+
+	call extend(l:context, {'is_replacing': 1, 'range_start': getpos("'<")[1], 'range_end': getpos("'>")[1]})
+	let l:selection = join(getline(l:context.range_start, l:context.range_end), "\n")
+	let l:instruction = input("Instruction: ")
+	redraw
+	if empty(l:instruction) && empty(l:selection) | return | endif
+
+	let l:mod_prompt = g:llm_visual_task_prompt . "\n"
+	if l:instruction =~? 'refactor' | let l:mod_prompt .= "Instruction: Refactor for efficiency/readability.\n" | endif
+	if l:instruction =~? 'fix\|bug' | let l:mod_prompt .= "Instruction: Fix bugs/errors.\n" | endif
+	if l:instruction =~? 'doc\|comment' | let l:mod_prompt .= "Instruction: Add documentation.\n" | endif
+	if !empty(l:instruction) | let l:mod_prompt .= "Additional Instruction: " . l:instruction . "\n" | endif
+
+	let l:suffix = join(getline(l:context.range_end + 1, min([l:context.range_end + 100, line('$')])), "\n")
+	let l:prompt = l:file_info . "\n" . l:style . "\n\n"
+	let l:prompt .= "[CONTEXT - FOR REFERENCE ONLY]\n" . llm#GetSurroundingContext(l:context.range_start, l:context.range_end) . "\n"
+	if !empty(l:suffix) | let l:prompt .= "[CODE AFTER SELECTION - FOR REFERENCE ONLY]\n" . l:suffix . "\n" | endif
+	let l:prompt .= "\n" . l:mod_prompt
+	let l:prompt .= "\n[CODE TO REPLACE]\n" . l:selection . "\n\nREPLACEMENT CODE:"
+	call llm#StartJob(l:prompt, l:context, 'llm#HandleCodeExit')
+endfunction
+
+" Prompt for a question about the current buffer (plus the '<,'> lines when a:is_visual is true) and start a job whose answer goes to the [LLM Answer] buffer.
+function! llm#AskQuestion(is_visual)
+	let [l:first, l:last, l:picked] = [line('.'), line('.'), ""]
+	if a:is_visual
+		let [l:first, l:last] = [getpos("'<")[1], getpos("'>")[1]]
+		let l:picked = join(getline(l:first, l:last), "\n")
+	endif
+
+	let l:question = input("Ask: ")
+	redraw
+	if empty(l:question) | return | endif
+
+	" Collect the prompt pieces while the source buffer is still current; they are joined with newlines when the job starts.
+	let l:parts = ["File: " . expand('%:t') . " (type: " . &filetype . ")", g:llm_system_prompt_question, "", llm#GetSurroundingContext(l:first, l:last)]
+	if !empty(l:picked)
+		call extend(l:parts, ["", "Context Selection:\n" . l:picked])
+	endif
+	call extend(l:parts, ["", "Question: " . l:question])
+
+	" Reset the scratch buffer to a short header for this question and scroll it to the top.
+	let l:scratch = llm#GetOrCreateScratch()
+	silent call deletebufline(l:scratch, 1, '$')
+	call setbufline(l:scratch, 1, ["Question: " . l:question, "Model: " . g:llm_model, ""])
+	let l:win = bufwinid(l:scratch)
+	if l:win != -1 | call win_execute(l:win, 'normal! gg') | endif
+
+	call llm#StartJob(join(l:parts, "\n"), {'mode': 'question'}, 'llm#HandleQuestionExit')
+endfunction
+
+" Return the model names reported by GET {g:llm_url}/api/tags, or [] when curl fails or the reply cannot be used.
+function! llm#ListModels()
+	" Quote the URL: g:llm_url is user-configurable and system() passes the command through the shell.
+	let l:cmd = 'curl -sS --connect-timeout 2 ' . shellescape(g:llm_url . '/api/tags')
+	let l:output = system(l:cmd)
+	if v:shell_error != 0 | return [] | endif
+	try
+		let l:json = json_decode(l:output)
+		if type(l:json) == v:t_dict && has_key(l:json, 'models')
+			return map(l:json.models, 'v:val.name')
+		endif
+	catch
+	endtry
+	return []
+endfunction
+
+" Completion callback: the models from llm#ListModels() that start with a:ArgLead, compared literally (names contain '.', which the old regex match treated as a wildcard).
+function! llm#CompleteModel(ArgLead, CmdLine, CursorPos)
+	return filter(llm#ListModels(), 'stridx(v:val, a:ArgLead) == 0')
+endfunction
+
+" Set g:llm_model to a:model and report the change; an empty a:model only reports the current model.
+function! llm#SwitchModel(model)
+	let l:report = empty(a:model) ? "LLM: Current model is " : "LLM: Model set to "
+	if !empty(a:model)
+		let g:llm_model = a:model
+	endif
+	echo l:report . g:llm_model
+endfunction
diff --git a/doc/llm.txt b/doc/llm.txt
new file mode 100644
index 0000000000000000000000000000000000000000..deee931fd12bfc05a421b1c4335d2840cccba29f
--- /dev/null
+++ b/doc/llm.txt
@@ -0,0 +1,73 @@
+*llm.txt*   Asynchronous LLM interaction for Vim
+
+==============================================================================
+INTRODUCTION                                                   *llm-introduction*
+
+llm.vim is a plugin that allows you to interact with local Large Language
+Models via Ollama. It supports code completion, refactoring, and general
+questions, all handled asynchronously to keep Vim responsive.
+
+==============================================================================
+MAPPINGS                                                       *llm-mappings*
+
+<leader>l           (Visual Mode)
+                    Uses the selected text as context and prompts for an
+                    instruction (e.g., "refactor this"). The selection is
+                    replaced with the LLM's response.
+
+<leader>k           (Normal or Visual Mode)
+                    Opens a vertical split on the right named "[LLM Answer]"
+                    and prompts for a question. The buffer uses Markdown
+                    formatting and is cleared for each new question.
+
+==============================================================================
+COMMANDS                                                        *llm-commands*
+
+:LLMModel [model]                                               *:LLMModel*
+                    Switch the current model (|g:llm_model|).
+                    Supports custom tab-completion for all locally
+                    available Ollama models. If called without arguments,
+                    it displays the current model.
+
+==============================================================================
+CONFIGURATION                                                   *llm-config*
+
+The following variables can be set in your |vimrc| to override defaults:
+
+g:llm_model                                                 *g:llm_model*
+    The Ollama model name to use.
+    Default: 'falcon3:latest'
+
+g:llm_url                                                   *g:llm_url*
+    The base URL for the Ollama API.
+    Default: 'http://127.0.0.1:11434'
+
+g:llm_system_prompt_question                   *g:llm_system_prompt_question*
+    The system prompt used for general questions.
+
+g:llm_visual_task_prompt                       *g:llm_visual_task_prompt*
+    The prompt used for code replacement in visual mode.
+
+g:llm_disable_mappings                       *g:llm_disable_mappings*
+    If set to 1, the default mappings for <leader>l and <leader>k are
+    not defined.
+    Default: 0
+
+==============================================================================
+CUSTOM MAPPINGS                                           *llm-custom-mappings*
+
+If you disable default mappings, you can define your own using the
+following functions:
+
+    llm#AskQuestion(is_visual)
+    llm#RequestCompletion(is_visual)
+
+Example:
+>
+    let g:llm_disable_mappings = 1
+    nnoremap <C-k> :call llm#AskQuestion(0)<CR>
+    xnoremap <C-k> :<C-u>call llm#AskQuestion(1)<CR>
+    xnoremap <C-l> :<C-u>call llm#RequestCompletion(1)<CR>
+<
+==============================================================================
+vim:tw=78:ts=8:ft=help:norl:
diff --git a/main.go b/main.go
new file mode 100644
index 0000000000000000000000000000000000000000..8748cb32225b7c05ed2e1f02ebe85f0b7133a71d
--- /dev/null
+++ b/main.go
@@ -0,0 +1,93 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io/fs"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+)
+
+const Version = "0.1.0"
+// loggingResponseWriter wraps an http.ResponseWriter and remembers the status code passed to WriteHeader.
+type loggingResponseWriter struct {
+	http.ResponseWriter
+	statusCode int // overwritten by WriteHeader; loggingMiddleware initialises it to http.StatusOK
+}
+// WriteHeader records code for the request log and then forwards it to the wrapped writer.
+func (w *loggingResponseWriter) WriteHeader(code int) {
+	w.statusCode = code
+	w.ResponseWriter.WriteHeader(code)
+}
+// loggingMiddleware returns a handler that runs next and then logs the request's method, path, status code and elapsed time.
+func loggingMiddleware(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
+		began := time.Now()
+		rec := &loggingResponseWriter{ResponseWriter: w, statusCode: http.StatusOK}
+		next.ServeHTTP(rec, req)
+		log.Printf("%s %s %d %v", req.Method, req.URL.Path, rec.statusCode, time.Since(began))
+	})
+}
+// GlobalConfig is the configuration that main loads from ConfigPath at startup.
+var (
+	GlobalConfig *Config
+)
+// main parses the command-line flags, loads the config and cache, registers the routes and serves HTTP on :8080.
+func main() {
+	var showVersion bool
+	flag.StringVar(&ConfigPath, "c", "config.yaml", "custom config path")
+	flag.BoolVar(&showVersion, "v", false, "show version")
+	flag.Parse()
+	if showVersion {
+		fmt.Printf("bbgit version %s\n", Version)
+		return
+	}
+
+	var err error
+	GlobalConfig, err = loadConfig(ConfigPath)
+	if err != nil {
+		log.Fatalf("Error loading config: %v", err)
+	}
+	LoadCache()
+	StartSaver()
+	// Stop at startup if the static sub-tree cannot be opened, rather than continuing with a nil file system.
+	staticSub, err := fs.Sub(staticFS, "static")
+	if err != nil {
+		log.Fatalf("Error opening static assets: %v", err)
+	}
+	mux := http.NewServeMux()
+	mux.Handle("GET /static/", http.StripPrefix("/static/", http.FileServer(http.FS(staticSub))))
+	mux.HandleFunc("GET /{$}", homeHandler)
+	mux.HandleFunc("GET /r/{name}", repoHandler)
+	mux.HandleFunc("GET /r/{name}/readme", readmeHandler)
+	mux.HandleFunc("GET /r/{name}/license", licenseHandler)
+	mux.HandleFunc("GET /r/{name}/markers", markersHandler)
+	mux.HandleFunc("GET /r/{name}/commits.rss", repoCommitsRSSHandler)
+	mux.HandleFunc("GET /r/{name}/tags.rss", repoTagsRSSHandler)
+	mux.HandleFunc("GET /r/{name}/files/{path...}", filesHandler)
+	mux.HandleFunc("GET /r/{name}/blob/{path...}", blobHandler)
+	mux.HandleFunc("GET /r/{name}/raw/{path...}", rawHandler)
+	mux.HandleFunc("GET /r/{name}/archive/{path...}", archiveHandler)
+	mux.HandleFunc("GET /r/{name}/c/{hash}", commitHandler)
+	mux.HandleFunc("GET /r/{name}/c/{hash}/patch", patchHandler)
+
+	go func() {
+		sigChan := make(chan os.Signal, 1)
+		signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
+		<-sigChan
+		log.Println("Shutting down... saving cache.")
+		if err := SaveCache(); err != nil {
+			log.Printf("Error saving cache on shutdown: %v", err)
+		}
+		os.Exit(0)
+	}()
+
+	fmt.Printf("Server starting on :8080 (config: %s)...\n", ConfigPath)
+	if err := http.ListenAndServe(":8080", loggingMiddleware(mux)); err != nil {
+		log.Fatalf("Error starting server: %v", err)
+	}
+}
diff --git a/ollama.md b/ollama.md
new file mode 100644
index 0000000000000000000000000000000000000000..150479e6a079e892e786f2801bb8de5b50802100
--- /dev/null
+++ b/ollama.md
@@ -0,0 +1,1931 @@
+# API
+
+> Note: Ollama's API docs are moving to https://docs.ollama.com/api
+
+## Endpoints
+
+- [Generate a completion](#generate-a-completion)
+- [Generate a chat completion](#generate-a-chat-completion)
+- [Create a Model](#create-a-model)
+- [List Local Models](#list-local-models)
+- [Show Model Information](#show-model-information)
+- [Copy a Model](#copy-a-model)
+- [Delete a Model](#delete-a-model)
+- [Pull a Model](#pull-a-model)
+- [Push a Model](#push-a-model)
+- [Generate Embeddings](#generate-embeddings)
+- [List Running Models](#list-running-models)
+- [Version](#version)
+- [Experimental: Image Generation](#image-generation-experimental)
+
+## Conventions
+
+### Model names
+
+Model names follow a `model:tag` format, where `model` can have an optional namespace such as `example/model`. Some examples are `orca-mini:3b-q8_0` and `llama3:70b`. The tag is optional and, if not provided, will default to `latest`. The tag is used to identify a specific version.
+
+### Durations
+
+All durations are returned in nanoseconds.
+
+### Streaming responses
+
+Certain endpoints stream responses as JSON objects. Streaming can be disabled by providing `{"stream": false}` for these endpoints.
+
+## Generate a completion
+
+```
+POST /api/generate
+```
+
+Generate a response for a given prompt with a provided model. This is a streaming endpoint, so there will be a series of responses. The final response object will include statistics and additional data from the request.
+
+### Parameters
+
+- `model`: (required) the [model name](#model-names)
+- `prompt`: the prompt to generate a response for
+- `suffix`: the text after the model response
+- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
+- `think`: (for thinking models) should the model think before responding?
+
+Advanced parameters (optional):
+
+- `format`: the format to return a response in. Format can be `json` or a JSON schema
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
+- `system`: system message to use (overrides what is defined in the `Modelfile`)
+- `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
+- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
+- `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
+
+Experimental image generation parameters (for image generation models only):
+
+> [!WARNING]
+> These parameters are experimental and may change in future versions.
+
+- `width`: width of the generated image in pixels
+- `height`: height of the generated image in pixels
+- `steps`: number of diffusion steps
+
+#### Structured outputs
+
+Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [structured outputs](#request-structured-outputs) example below.
+
+#### JSON mode
+
+Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
+
+> [!IMPORTANT]
+> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts of whitespace.
+
+### Examples
+
+#### Generate request (Streaming)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2",
+  "prompt": "Why is the sky blue?"
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T08:52:19.385406455-07:00",
+  "response": "The",
+  "done": false
+}
+```
+
+The final response in the stream also includes additional data about the generation:
+
+- `total_duration`: time spent generating the response
+- `load_duration`: time spent in nanoseconds loading the model
+- `prompt_eval_count`: number of tokens in the prompt
+- `prompt_eval_duration`: time spent in nanoseconds evaluating the prompt
+- `eval_count`: number of tokens in the response
+- `eval_duration`: time in nanoseconds spent generating the response
+- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
+- `response`: empty if the response was streamed, if not streamed, this will contain the full response
+
+To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration` \* `10^9`.
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T19:22:45.499127Z",
+  "response": "",
+  "done": true,
+  "context": [1, 2, 3],
+  "total_duration": 10706818083,
+  "load_duration": 6338219291,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 130079000,
+  "eval_count": 259,
+  "eval_duration": 4232710000
+}
+```
+
+#### Request (No streaming)
+
+##### Request
+
+A response can be received in one reply when streaming is off.
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2",
+  "prompt": "Why is the sky blue?",
+  "stream": false
+}'
+```
+
+##### Response
+
+If `stream` is set to `false`, the response will be a single JSON object:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T19:22:45.499127Z",
+  "response": "The sky is blue because it is the color of the sky.",
+  "done": true,
+  "context": [1, 2, 3],
+  "total_duration": 5043500667,
+  "load_duration": 5025959,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 325953000,
+  "eval_count": 290,
+  "eval_duration": 4709213000
+}
+```
+
+#### Request (with suffix)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "codellama:code",
+  "prompt": "def compute_gcd(a, b):",
+  "suffix": "    return result",
+  "options": {
+    "temperature": 0
+  },
+  "stream": false
+}'
+```
+
+##### Response
+
+```json5
+{
+  "model": "codellama:code",
+  "created_at": "2024-07-22T20:47:51.147561Z",
+  "response": "\n  if a == 0:\n    return b\n  else:\n    return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n  result = (a * b) / compute_gcd(a, b)\n",
+  "done": true,
+  "done_reason": "stop",
+  "context": [...],
+  "total_duration": 1162761250,
+  "load_duration": 6683708,
+  "prompt_eval_count": 17,
+  "prompt_eval_duration": 201222000,
+  "eval_count": 63,
+  "eval_duration": 953997000
+}
+```
+
+#### Request (Structured outputs)
+
+##### Request
+
+```shell
+curl -X POST http://localhost:11434/api/generate -H "Content-Type: application/json" -d '{
+  "model": "llama3.1:8b",
+  "prompt": "Ollama is 22 years old and is busy saving the world. Respond using JSON",
+  "stream": false,
+  "format": {
+    "type": "object",
+    "properties": {
+      "age": {
+        "type": "integer"
+      },
+      "available": {
+        "type": "boolean"
+      }
+    },
+    "required": [
+      "age",
+      "available"
+    ]
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.1:8b",
+  "created_at": "2024-12-06T00:48:09.983619Z",
+  "response": "{\n  \"age\": 22,\n  \"available\": true\n}",
+  "done": true,
+  "done_reason": "stop",
+  "context": [1, 2, 3],
+  "total_duration": 1075509083,
+  "load_duration": 567678166,
+  "prompt_eval_count": 28,
+  "prompt_eval_duration": 236000000,
+  "eval_count": 16,
+  "eval_duration": 269000000
+}
+```
+
+#### Request (JSON mode)
+
+> [!IMPORTANT]
+> When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2",
+  "prompt": "What color is the sky at different times of the day? Respond using JSON",
+  "format": "json",
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-11-09T21:07:55.186497Z",
+  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
+  "done": true,
+  "context": [1, 2, 3],
+  "total_duration": 4648158584,
+  "load_duration": 4071084,
+  "prompt_eval_count": 36,
+  "prompt_eval_duration": 439038000,
+  "eval_count": 180,
+  "eval_duration": 4196918000
+}
+```
+
+The value of `response` will be a string containing JSON similar to:
+
+```json
+{
+  "morning": {
+    "color": "blue"
+  },
+  "noon": {
+    "color": "blue-gray"
+  },
+  "afternoon": {
+    "color": "warm gray"
+  },
+  "evening": {
+    "color": "orange"
+  }
+}
+```
+
+#### Request (with images)
+
+To submit images to multimodal models such as `llava` or `bakllava`, provide a list of base64-encoded `images`:
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llava",
+  "prompt":"What is in this picture?",
+  "stream": false,
+  "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jD
SsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKP
qsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+}'
+```
+
+#### Response
+
+```json
+{
+  "model": "llava",
+  "created_at": "2023-11-03T15:36:02.583064Z",
+  "response": "A happy cartoon character, which is cute and cheerful.",
+  "done": true,
+  "context": [1, 2, 3],
+  "total_duration": 2938432250,
+  "load_duration": 2559292,
+  "prompt_eval_count": 1,
+  "prompt_eval_duration": 2195557000,
+  "eval_count": 44,
+  "eval_duration": 736432000
+}
+```
+
+#### Request (Raw Mode)
+
+In some cases, you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable templating. Also note that raw mode will not return a context.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "mistral",
+  "prompt": "[INST] why is the sky blue? [/INST]",
+  "raw": true,
+  "stream": false
+}'
+```
+
+#### Request (Reproducible outputs)
+
+For reproducible outputs, set `seed` to a number:
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "mistral",
+  "prompt": "Why is the sky blue?",
+  "options": {
+    "seed": 123
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "mistral",
+  "created_at": "2023-11-03T15:36:02.583064Z",
+  "response": " The sky appears blue because of a phenomenon called Rayleigh scattering.",
+  "done": true,
+  "total_duration": 8493852375,
+  "load_duration": 6589624375,
+  "prompt_eval_count": 14,
+  "prompt_eval_duration": 119039000,
+  "eval_count": 110,
+  "eval_duration": 1779061000
+}
+```
+
+#### Generate request (With options)
+
+If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2",
+  "prompt": "Why is the sky blue?",
+  "stream": false,
+  "options": {
+    "num_keep": 5,
+    "seed": 42,
+    "num_predict": 100,
+    "top_k": 20,
+    "top_p": 0.9,
+    "min_p": 0.0,
+    "typical_p": 0.7,
+    "repeat_last_n": 33,
+    "temperature": 0.8,
+    "repeat_penalty": 1.2,
+    "presence_penalty": 1.5,
+    "frequency_penalty": 1.0,
+    "penalize_newline": true,
+    "stop": ["\n", "user:"],
+    "numa": false,
+    "num_ctx": 1024,
+    "num_batch": 2,
+    "num_gpu": 1,
+    "main_gpu": 0,
+    "use_mmap": true,
+    "num_thread": 8
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T19:22:45.499127Z",
+  "response": "The sky is blue because it is the color of the sky.",
+  "done": true,
+  "context": [1, 2, 3],
+  "total_duration": 4935886791,
+  "load_duration": 534986708,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 107345000,
+  "eval_count": 237,
+  "eval_duration": 4289432000
+}
+```
+
+#### Load a model
+
+If an empty prompt is provided, the model will be loaded into memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2"
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-12-18T19:52:07.071755Z",
+  "response": "",
+  "done": true
+}
+```
+
+#### Unload a model
+
+If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.2",
+  "keep_alive": 0
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2024-09-12T03:54:03.516566Z",
+  "response": "",
+  "done": true,
+  "done_reason": "unload"
+}
+```
+
+## Generate a chat completion
+
+```
+POST /api/chat
+```
+
+Generate the next message in a chat with a provided model. This is a streaming endpoint, so there will be a series of responses. Streaming can be disabled using `"stream": false`. The final response object will include statistics and additional data from the request.
+
+### Parameters
+
+- `model`: (required) the [model name](#model-names)
+- `messages`: the messages of the chat; this can be used to keep a chat memory
+- `tools`: list of tools in JSON for the model to use if supported
+- `think`: (for thinking models) should the model think before responding?
+
+The `message` object has the following fields:
+
+- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
+- `content`: the content of the message
+- `thinking`: (for thinking models) the model's thinking process
+- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
+- `tool_calls` (optional): a list of tools in JSON that the model wants to use
+- `tool_name` (optional): add the name of the tool that was executed to inform the model of the result
+
+Advanced parameters (optional):
+
+- `format`: the format to return a response in. Format can be `json` or a JSON schema.
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
+- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+
+### Tool calling
+
+Tool calling is supported by providing a list of tools in the `tools` parameter. The model will generate a response that includes a list of tool calls. See the [Chat request (Streaming with tools)](#chat-request-streaming-with-tools) example below.
+
+Models can also explain the result of the tool call in the response. See the [Chat request (With history, with tools)](#chat-request-with-history-with-tools) example below.
+
+[See models with tool calling capabilities](https://ollama.com/search?c=tool).
+
+### Structured outputs
+
+Structured outputs are supported by providing a JSON schema in the `format` parameter. The model will generate a response that matches the schema. See the [Chat request (Structured outputs)](#chat-request-structured-outputs) example below.
+
+### Examples
+
+#### Chat request (Streaming)
+
+##### Request
+
+Send a chat message with a streaming response.
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "why is the sky blue?"
+    }
+  ]
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T08:52:19.385406455-07:00",
+  "message": {
+    "role": "assistant",
+    "content": "The",
+    "images": null
+  },
+  "done": false
+}
+```
+
+Final response:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T19:22:45.499127Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done": true,
+  "total_duration": 4883583458,
+  "load_duration": 1334875,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 342546000,
+  "eval_count": 282,
+  "eval_duration": 4535599000
+}
+```
+
+#### Chat request (Streaming with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in tokyo?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ],
+  "stream": true
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:22:19.184789Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_weather",
+          "arguments": {
+            "city": "Tokyo"
+          }
+        }
+      }
+    ]
+  },
+  "done": false
+}
+```
+
+Final response:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:22:19.19314Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 182242375,
+  "load_duration": 41295167,
+  "prompt_eval_count": 169,
+  "prompt_eval_duration": 24573166,
+  "eval_count": 15,
+  "eval_duration": 115959084
+}
+```
+
+#### Chat request (No streaming)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "why is the sky blue?"
+    }
+  ],
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-12-12T14:13:43.416799Z",
+  "message": {
+    "role": "assistant",
+    "content": "Hello! How are you today?"
+  },
+  "done": true,
+  "total_duration": 5191566416,
+  "load_duration": 2154458,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 383809000,
+  "eval_count": 298,
+  "eval_duration": 4799921000
+}
+```
+
+#### Chat request (No streaming, with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in tokyo?"
+    }
+  ],
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ],
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:32:53.844124Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_weather",
+          "arguments": {
+            "city": "Tokyo"
+          }
+        }
+      }
+    ]
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 3244883583,
+  "load_duration": 2969184542,
+  "prompt_eval_count": 169,
+  "prompt_eval_duration": 141656333,
+  "eval_count": 18,
+  "eval_duration": 133293625
+}
+```
+
+#### Chat request (Structured outputs)
+
+##### Request
+
+```shell
+curl -X POST http://localhost:11434/api/chat -H "Content-Type: application/json" -d '{
+  "model": "llama3.1",
+  "messages": [{"role": "user", "content": "Ollama is 22 years old and busy saving the world. Return a JSON object with the age and availability."}],
+  "stream": false,
+  "format": {
+    "type": "object",
+    "properties": {
+      "age": {
+        "type": "integer"
+      },
+      "available": {
+        "type": "boolean"
+      }
+    },
+    "required": [
+      "age",
+      "available"
+    ]
+  },
+  "options": {
+    "temperature": 0
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.1",
+  "created_at": "2024-12-06T00:46:58.265747Z",
+  "message": {
+    "role": "assistant",
+    "content": "{\"age\": 22, \"available\": false}"
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 2254970291,
+  "load_duration": 574751416,
+  "prompt_eval_count": 34,
+  "prompt_eval_duration": 1502000000,
+  "eval_count": 12,
+  "eval_duration": 175000000
+}
+```
+
+#### Chat request (With History)
+
+Send a chat message with a conversation history. You can use this same approach to start the conversation using multi-shot or chain-of-thought prompting.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "why is the sky blue?"
+    },
+    {
+      "role": "assistant",
+      "content": "due to rayleigh scattering."
+    },
+    {
+      "role": "user",
+      "content": "how is that different than mie scattering?"
+    }
+  ]
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T08:52:19.385406455-07:00",
+  "message": {
+    "role": "assistant",
+    "content": "The"
+  },
+  "done": false
+}
+```
+
+Final response:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-08-04T19:22:45.499127Z",
+  "done": true,
+  "total_duration": 8113331500,
+  "load_duration": 6396458,
+  "prompt_eval_count": 61,
+  "prompt_eval_duration": 398801000,
+  "eval_count": 468,
+  "eval_duration": 7701267000
+}
+```
+
+#### Chat request (With history, with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is the weather in Toronto?"
+    },
+    // the message from the model appended to history
+    {
+      "role": "assistant",
+      "content": "",
+      "tool_calls": [
+        {
+          "function": {
+            "name": "get_weather",
+            "arguments": {
+              "city": "Toronto"
+            }
+          }
+        }
+      ]
+    },
+    // the tool call result appended to history
+    {
+      "role": "tool",
+      "content": "11 degrees celsius",
+      "tool_name": "get_weather"
+    }
+  ],
+  "stream": false,
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_weather",
+        "description": "Get the weather in a given city",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "city": {
+              "type": "string",
+              "description": "The city to get the weather for"
+            }
+          },
+          "required": ["city"]
+        }
+      }
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2025-07-07T20:43:37.688511Z",
+  "message": {
+    "role": "assistant",
+    "content": "The current temperature in Toronto is 11°C."
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 890771750,
+  "load_duration": 707634750,
+  "prompt_eval_count": 94,
+  "prompt_eval_duration": 91703208,
+  "eval_count": 11,
+  "eval_duration": 90282125
+}
+```
+
+#### Chat request (with images)
+
+##### Request
+
+Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64.
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llava",
+  "messages": [
+    {
+      "role": "user",
+      "content": "what is in this image?",
+      "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQn
S4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jr
UwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"]
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llava",
+  "created_at": "2023-12-13T22:42:50.203334Z",
+  "message": {
+    "role": "assistant",
+    "content": " The image features a cute, little pig with an angry facial expression. It's wearing a heart on its shirt and is waving in the air. This scene appears to be part of a drawing or sketching project.",
+    "images": null
+  },
+  "done": true,
+  "total_duration": 1668506709,
+  "load_duration": 1986209,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 359682000,
+  "eval_count": 83,
+  "eval_duration": 1303285000
+}
+```
+
+#### Chat request (Reproducible outputs)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Hello!"
+    }
+  ],
+  "options": {
+    "seed": 101,
+    "temperature": 0
+  }
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2023-12-12T14:13:43.416799Z",
+  "message": {
+    "role": "assistant",
+    "content": "Hello! How are you today?"
+  },
+  "done": true,
+  "total_duration": 5191566416,
+  "load_duration": 2154458,
+  "prompt_eval_count": 26,
+  "prompt_eval_duration": 383809000,
+  "eval_count": 298,
+  "eval_duration": 4799921000
+}
+```
+
+#### Chat request (with tools)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the weather today in Paris?"
+    }
+  ],
+  "stream": false,
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather for a location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The location to get the weather for, e.g. San Francisco, CA"
+            },
+            "format": {
+              "type": "string",
+              "description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'",
+              "enum": ["celsius", "fahrenheit"]
+            }
+          },
+          "required": ["location", "format"]
+        }
+      }
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2024-07-22T20:33:28.123648Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_current_weather",
+          "arguments": {
+            "format": "celsius",
+            "location": "Paris, FR"
+          }
+        }
+      }
+    ]
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 885095291,
+  "load_duration": 3753500,
+  "prompt_eval_count": 122,
+  "prompt_eval_duration": 328493000,
+  "eval_count": 33,
+  "eval_duration": 552222000
+}
+```
+
+#### Load a model
+
+If the messages array is empty, the model will be loaded into memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": []
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2024-09-12T21:17:29.110811Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "load",
+  "done": true
+}
+```
+
+#### Unload a model
+
+If the messages array is empty and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.2",
+  "messages": [],
+  "keep_alive": 0
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model": "llama3.2",
+  "created_at": "2024-09-12T21:33:17.547535Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "unload",
+  "done": true
+}
+```
+
+## Create a Model
+
+```
+POST /api/create
+```
+
+Create a model from:
+
+- another model;
+- a safetensors directory; or
+- a GGUF file.
+
+If you are creating a model from a safetensors directory or from a GGUF file, you must [create a blob](#push-a-blob) for each of the files and then use the file name and SHA256 digest associated with each blob in the `files` field.
+
+### Parameters
+
+- `model`: name of the model to create
+- `from`: (optional) name of an existing model to create the new model from
+- `files`: (optional) a dictionary of file names to SHA256 digests of blobs to create the model from
+- `adapters`: (optional) a dictionary of file names to SHA256 digests of blobs for LoRA adapters
+- `template`: (optional) the prompt template for the model
+- `license`: (optional) a string or list of strings containing the license or licenses for the model
+- `system`: (optional) a string containing the system prompt for the model
+- `parameters`: (optional) a dictionary of parameters for the model (see [Modelfile](./modelfile.mdx#valid-parameters-and-values) for a list of parameters)
+- `messages`: (optional) a list of message objects used to create a conversation
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
+- `quantize`: (optional) quantize a non-quantized (e.g. float16) model
+
+#### Quantization types
+
+| Type   | Recommended |
+| ------ | :---------: |
+| q4_K_M |     \*      |
+| q4_K_S |             |
+| q8_0   |     \*      |
+
+### Examples
+
+#### Create a new model
+
+Create a new model from an existing model.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/create -d '{
+  "model": "mario",
+  "from": "llama3.2",
+  "system": "You are Mario from Super Mario Bros."
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{"status":"reading model metadata"}
+{"status":"creating system layer"}
+{"status":"using already created layer sha256:22f7f8ef5f4c791c1b03d7eb414399294764d7cc82c7e94aa81a1feb80a983a2"}
+{"status":"using already created layer sha256:8c17c2ebb0ea011be9981cc3922db8ca8fa61e828c5d3f44cb6ae342bf80460b"}
+{"status":"using already created layer sha256:7c23fb36d80141c4ab8cdbb61ee4790102ebd2bf7aeff414453177d4f2110e5d"}
+{"status":"using already created layer sha256:2e0493f67d0c8c9c68a8aeacdf6a38a2151cb3c4c1d42accf296e19810527988"}
+{"status":"using already created layer sha256:2759286baa875dc22de5394b4a925701b1896a7e3f8e53275c36f75a877a82c9"}
+{"status":"writing layer sha256:df30045fe90f0d750db82a058109cecd6d4de9c90a3d75b19c09e5f64580bb42"}
+{"status":"writing layer sha256:f18a68eb09bf925bb1b669490407c1b1251c5db98dc4d3d81f3088498ea55690"}
+{"status":"writing manifest"}
+{"status":"success"}
+```
+
+#### Quantize a model
+
+Quantize a non-quantized model.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/create -d '{
+  "model": "llama3.2:quantized",
+  "from": "llama3.2:3b-instruct-fp16",
+  "quantize": "q4_K_M"
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":12302}
+{"status":"quantizing F16 model to Q4_K_M","digest":"0","total":6433687776,"completed":6433687552}
+{"status":"verifying conversion"}
+{"status":"creating new layer sha256:fb7f4f211b89c6c4928ff4ddb73db9f9c0cfca3e000c3e40d6cf27ddc6ca72eb"}
+{"status":"using existing layer sha256:966de95ca8a62200913e3f8bfbf84c8494536f1b94b49166851e76644e966396"}
+{"status":"using existing layer sha256:fcc5a6bec9daf9b561a68827b67ab6088e1dba9d1fa2a50d7bbcc8384e0a265d"}
+{"status":"using existing layer sha256:a70ff7e570d97baaf4e62ac6e6ad9975e04caa6d900d3742d37698494479e0cd"}
+{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
+{"status":"writing manifest"}
+{"status":"success"}
+```
+
+#### Create a model from GGUF
+
+Create a model from a GGUF file. The `files` parameter should be filled out with the file name and SHA256 digest of the GGUF file you wish to use. Use [/api/blobs/:digest](#push-a-blob) to push the GGUF file to the server before calling this API.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/create -d '{
+  "model": "my-gguf-model",
+  "files": {
+    "test.gguf": "sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3"
+  }
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{"status":"parsing GGUF"}
+{"status":"using existing layer sha256:432f310a77f4650a88d0fd59ecdd7cebed8d684bafea53cbff0473542964f0c3"}
+{"status":"writing manifest"}
+{"status":"success"}
+```
+
+#### Create a model from a Safetensors directory
+
+The `files` parameter should include a dictionary of files for the safetensors model which includes the file names and SHA256 digest of each file. Use [/api/blobs/:digest](#push-a-blob) to first push each of the files to the server before calling this API. Files will remain in the cache until the Ollama server is restarted.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/create -d '{
+  "model": "fred",
+  "files": {
+    "config.json": "sha256:dd3443e529fb2290423a0c65c2d633e67b419d273f170259e27297219828e389",
+    "generation_config.json": "sha256:88effbb63300dbbc7390143fbbdd9d9fa50587b37e8bfd16c8c90d4970a74a36",
+    "special_tokens_map.json": "sha256:b7455f0e8f00539108837bfa586c4fbf424e31f8717819a6798be74bef813d05",
+    "tokenizer.json": "sha256:bbc1904d35169c542dffbe1f7589a5994ec7426d9e5b609d07bab876f32e97ab",
+    "tokenizer_config.json": "sha256:24e8a6dc2547164b7002e3125f10b415105644fcf02bf9ad8b674c87b1eaaed6",
+    "model.safetensors": "sha256:1ff795ff6a07e6a68085d206fb84417da2f083f68391c2843cd2b8ac6df8538f"
+  }
+}'
+```
+
+##### Response
+
+A stream of JSON objects is returned:
+
+```json
+{"status":"converting model"}
+{"status":"creating new layer sha256:05ca5b813af4a53d2c2922933936e398958855c44ee534858fcfd830940618b6"}
+{"status":"using autodetected template llama3-instruct"}
+{"status":"using existing layer sha256:56bb8bd477a519ffa694fc449c2413c6f0e1d3b1c88fa7e3c9d88d3ae49d4dcb"}
+{"status":"writing manifest"}
+{"status":"success"}
+```
+
+## Check if a Blob Exists
+
+```shell
+HEAD /api/blobs/:digest
+```
+
+Ensures that the file blob (Binary Large Object) used when creating a model exists on the server. This checks your Ollama server and not ollama.com.
+
+### Query Parameters
+
+- `digest`: the SHA256 digest of the blob
+
+### Examples
+
+#### Request
+
+```shell
+curl -I http://localhost:11434/api/blobs/sha256:29fdb92e57cf0827ded04ae6461b5931d01fa595843f55d36f5b275a52087dd2
+```
+
+#### Response
+
+Returns 200 OK if the blob exists, or 404 Not Found if it does not.
+
+## Push a Blob
+
+```
+POST /api/blobs/:digest
+```
+
+Push a file to the Ollama server to create a "blob" (Binary Large Object).
+
+### Query Parameters
+
+- `digest`: the expected SHA256 digest of the file
+
+### Examples
+
+#### Request
+
+```shell
+curl -T model.gguf -X POST http://localhost:11434/api/blobs/sha256:29fdb92e57cf0827ded04ae6461b5931d01fa595843f55d36f5b275a52087dd2
+```
+
+#### Response
+
+Returns 201 Created if the blob was successfully created, or 400 Bad Request if the digest used is not the expected one.
+
+## List Local Models
+
+```
+GET /api/tags
+```
+
+List models that are available locally.
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/tags
+```
+
+#### Response
+
+A single JSON object will be returned.
+
+```json
+{
+  "models": [
+    {
+      "name": "deepseek-r1:latest",
+      "model": "deepseek-r1:latest",
+      "modified_at": "2025-05-10T08:06:48.639712648-07:00",
+      "size": 4683075271,
+      "digest": "0a8c266910232fd3291e71e5ba1e058cc5af9d411192cf88b6d30e92b6e73163",
+      "details": {
+        "parent_model": "",
+        "format": "gguf",
+        "family": "qwen2",
+        "families": ["qwen2"],
+        "parameter_size": "7.6B",
+        "quantization_level": "Q4_K_M"
+      }
+    },
+    {
+      "name": "llama3.2:latest",
+      "model": "llama3.2:latest",
+      "modified_at": "2025-05-04T17:37:44.706015396-07:00",
+      "size": 2019393189,
+      "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
+      "details": {
+        "parent_model": "",
+        "format": "gguf",
+        "family": "llama",
+        "families": ["llama"],
+        "parameter_size": "3.2B",
+        "quantization_level": "Q4_K_M"
+      }
+    }
+  ]
+}
+```
+
+## Show Model Information
+
+```
+POST /api/show
+```
+
+Show information about a model, including details, modelfile, template, parameters, license, and system prompt.
+
+### Parameters
+
+- `model`: name of the model to show
+- `verbose`: (optional) if set to `true`, returns full data for verbose response fields
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/show -d '{
+  "model": "llava"
+}'
+```
+
+#### Response
+
+```json5
+{
+  modelfile: '# Modelfile generated by "ollama show"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llava:latest\n\nFROM /Users/matt/.ollama/models/blobs/sha256:200765e1283640ffbd013184bf496e261032fa75b99498a9613be4e94d63ad52\nTEMPLATE """{{ .System }}\nUSER: {{ .Prompt }}\nASSISTANT: """\nPARAMETER num_ctx 4096\nPARAMETER stop "\u003c/s\u003e"\nPARAMETER stop "USER:"\nPARAMETER stop "ASSISTANT:"',
+  parameters: 'num_keep                       24\nstop                           "<|start_header_id|>"\nstop                           "<|end_header_id|>"\nstop                           "<|eot_id|>"',
+  template: "{{ if .System }}<|start_header_id|>system<|end_header_id|>\n\n{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n{{ .Response }}<|eot_id|>",
+  details: {
+    parent_model: "",
+    format: "gguf",
+    family: "llama",
+    families: ["llama"],
+    parameter_size: "8.0B",
+    quantization_level: "Q4_0",
+  },
+  model_info: {
+    "general.architecture": "llama",
+    "general.file_type": 2,
+    "general.parameter_count": 8030261248,
+    "general.quantization_version": 2,
+    "llama.attention.head_count": 32,
+    "llama.attention.head_count_kv": 8,
+    "llama.attention.layer_norm_rms_epsilon": 0.00001,
+    "llama.block_count": 32,
+    "llama.context_length": 8192,
+    "llama.embedding_length": 4096,
+    "llama.feed_forward_length": 14336,
+    "llama.rope.dimension_count": 128,
+    "llama.rope.freq_base": 500000,
+    "llama.vocab_size": 128256,
+    "tokenizer.ggml.bos_token_id": 128000,
+    "tokenizer.ggml.eos_token_id": 128009,
+    "tokenizer.ggml.merges": [], // populates if `verbose=true`
+    "tokenizer.ggml.model": "gpt2",
+    "tokenizer.ggml.pre": "llama-bpe",
+    "tokenizer.ggml.token_type": [], // populates if `verbose=true`
+    "tokenizer.ggml.tokens": [], // populates if `verbose=true`
+  },
+  capabilities: ["completion", "vision"],
+}
+```
+
+## Copy a Model
+
+```
+POST /api/copy
+```
+
+Copy a model. Creates a model with another name from an existing model.
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/copy -d '{
+  "source": "llama3.2",
+  "destination": "llama3-backup"
+}'
+```
+
+#### Response
+
+Returns a 200 OK if successful, or a 404 Not Found if the source model doesn't exist.
+
+## Delete a Model
+
+```
+DELETE /api/delete
+```
+
+Delete a model and its data.
+
+### Parameters
+
+- `model`: model name to delete
+
+### Examples
+
+#### Request
+
+```shell
+curl -X DELETE http://localhost:11434/api/delete -d '{
+  "model": "llama3:13b"
+}'
+```
+
+#### Response
+
+Returns a 200 OK if successful, 404 Not Found if the model to be deleted doesn't exist.
+
+## Pull a Model
+
+```
+POST /api/pull
+```
+
+Download a model from the Ollama library. Cancelled pulls are resumed from where they left off, and multiple calls will share the same download progress.
+
+### Parameters
+
+- `model`: name of the model to pull
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/pull -d '{
+  "model": "llama3.2"
+}'
+```
+
+#### Response
+
+If `stream` is not specified, or set to `true`, a stream of JSON objects is returned:
+
+The first object is the manifest:
+
+```json
+{
+  "status": "pulling manifest"
+}
+```
+
+Then there is a series of downloading responses. Until some of the download has completed, the `completed` key may not be included. The number of files to be downloaded depends on the number of layers specified in the manifest.
+
+```json
+{
+  "status": "pulling digestname",
+  "digest": "digestname",
+  "total": 2142590208,
+  "completed": 241970
+}
+```
+
+After all the files are downloaded, the final responses are:
+
+```json
+{
+    "status": "verifying sha256 digest"
+}
+{
+    "status": "writing manifest"
+}
+{
+    "status": "removing any unused layers"
+}
+{
+    "status": "success"
+}
+```
+
+If `stream` is set to `false`, then the response is a single JSON object:
+
+```json
+{
+  "status": "success"
+}
+```
+
+## Push a Model
+
+```
+POST /api/push
+```
+
+Upload a model to a model library. Requires registering for ollama.ai and adding a public key first.
+
+### Parameters
+
+- `model`: name of the model to push in the form of `<namespace>/<model>:<tag>`
+- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/push -d '{
+  "model": "mattw/pygmalion:latest"
+}'
+```
+
+#### Response
+
+If `stream` is not specified, or set to `true`, a stream of JSON objects is returned:
+
+```json
+{ "status": "retrieving manifest" }
+```
+
+and then:
+
+```json
+{
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total": 1928429856
+}
+```
+
+Then there is a series of uploading responses:
+
+```json
+{
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total": 1928429856
+}
+```
+
+Finally, when the upload is complete:
+
+```json
+{"status":"pushing manifest"}
+{"status":"success"}
+```
+
+If `stream` is set to `false`, then the response is a single JSON object:
+
+```json
+{ "status": "success" }
+```
+
+## Generate Embeddings
+
+```
+POST /api/embed
+```
+
+Generate embeddings from a model
+
+### Parameters
+
+- `model`: name of model to generate embeddings from
+- `input`: text or list of text to generate embeddings for
+
+Advanced parameters:
+
+- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `dimensions`: number of dimensions for the embedding
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/embed -d '{
+  "model": "all-minilm",
+  "input": "Why is the sky blue?"
+}'
+```
+
+#### Response
+
+```json
+{
+  "model": "all-minilm",
+  "embeddings": [
+    [
+      0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
+      0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
+    ]
+  ],
+  "total_duration": 14143917,
+  "load_duration": 1019500,
+  "prompt_eval_count": 8
+}
+```
+
+#### Request (Multiple input)
+
+```shell
+curl http://localhost:11434/api/embed -d '{
+  "model": "all-minilm",
+  "input": ["Why is the sky blue?", "Why is the grass green?"]
+}'
+```
+
+#### Response
+
+```json
+{
+  "model": "all-minilm",
+  "embeddings": [
+    [
+      0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
+      0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
+    ],
+    [
+      -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725,
+      0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481
+    ]
+  ]
+}
+```
+
+## List Running Models
+
+```
+GET /api/ps
+```
+
+List models that are currently loaded into memory.
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/ps
+```
+
+#### Response
+
+A single JSON object will be returned.
+
+```json
+{
+  "models": [
+    {
+      "name": "mistral:latest",
+      "model": "mistral:latest",
+      "size": 5137025024,
+      "digest": "2ae6f6dd7a3dd734790bbbf58b8909a606e0e7e97e94b7604e0aa7ae4490e6d8",
+      "details": {
+        "parent_model": "",
+        "format": "gguf",
+        "family": "llama",
+        "families": ["llama"],
+        "parameter_size": "7.2B",
+        "quantization_level": "Q4_0"
+      },
+      "expires_at": "2024-06-04T14:38:31.83753-07:00",
+      "size_vram": 5137025024
+    }
+  ]
+}
+```
+
+## Generate Embedding
+
+> Note: this endpoint has been superseded by `/api/embed`
+
+```
+POST /api/embeddings
+```
+
+Generate embeddings from a model
+
+### Parameters
+
+- `model`: name of model to generate embeddings from
+- `prompt`: text to generate embeddings for
+
+Advanced parameters:
+
+- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.mdx#valid-parameters-and-values) such as `temperature`
+- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/embeddings -d '{
+  "model": "all-minilm",
+  "prompt": "Here is an article about llamas..."
+}'
+```
+
+#### Response
+
+```json
+{
+  "embedding": [
+    0.5670403838157654, 0.009260174818336964, 0.23178744316101074,
+    -0.2916173040866852, -0.8924556970596313, 0.8785552978515625,
+    -0.34576427936553955, 0.5742510557174683, -0.04222835972905159,
+    -0.137906014919281
+  ]
+}
+```
+
+## Version
+
+```
+GET /api/version
+```
+
+Retrieve the Ollama version
+
+### Examples
+
+#### Request
+
+```shell
+curl http://localhost:11434/api/version
+```
+
+#### Response
+
+```json
+{
+  "version": "0.5.1"
+}
+```
+
+## Experimental Features
+
+### Image Generation (Experimental)
+
+> [!WARNING]
+> Image generation is experimental and may change in future versions.
+
+Image generation is now supported through the standard `/api/generate` endpoint when using image generation models. The API automatically detects when an image generation model is being used.
+
+See the [Generate a completion](#generate-a-completion) section for the full API documentation. The experimental image generation parameters (`width`, `height`, `steps`) are documented there.
+
+#### Example
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "x/z-image-turbo",
+  "prompt": "a sunset over mountains",
+  "width": 1024,
+  "height": 768
+}'
+```
+
+##### Response (streaming)
+
+Progress updates during generation:
+
+```json
+{
+  "model": "x/z-image-turbo",
+  "created_at": "2024-01-15T10:30:00.000000Z",
+  "completed": 5,
+  "total": 20,
+  "done": false
+}
+```
+
+##### Final Response
+
+```json
+{
+  "model": "x/z-image-turbo",
+  "created_at": "2024-01-15T10:30:15.000000Z",
+  "image": "iVBORw0KGgoAAAANSUhEUg...",
+  "done": true,
+  "done_reason": "stop",
+  "total_duration": 15000000000,
+  "load_duration": 2000000000
+}
+```
diff --git a/plugin/llm.vim b/plugin/llm.vim
new file mode 100644
index 0000000000000000000000000000000000000000..08108d51731c6f438b7e6f1ce3fe6836264084d9
--- /dev/null
+++ b/plugin/llm.vim
@@ -0,0 +1,12 @@
+if exists('g:loaded_llm') " load guard: stop here if this plugin file was already sourced
+	finish
+endif
+let g:loaded_llm = 1 " mark the plugin as loaded so the guard above triggers next time
+
+if !get(g:, 'llm_disable_mappings', 0) " install default mappings unless g:llm_disable_mappings is set to a true value
+	xnoremap <leader>l :<C-u>call llm#RequestCompletion(1)<CR>
+	nnoremap <leader>k :call llm#AskQuestion(0)<CR>
+	xnoremap <leader>k :<C-u>call llm#AskQuestion(1)<CR>
+endif " visual-mode maps pass 1 and the normal-mode map passes 0 to the llm# autoload functions
+
+command! -nargs=? -complete=customlist,llm#CompleteModel LLMModel call llm#SwitchModel(<q-args>)