" LLM-based text completion using llama.cpp
"
" requires:
"
"   - neovim or vim
"   - curl
"   - llama.cpp server instance
"   - FIM-compatible model
"
" key bindings:
"
"   - Tab       - accept the current suggestion
"   - Shift+Tab - accept just the first line of the suggestion
"   - Ctrl+F    - toggle FIM completion manually
"
" make a symlink or copy this file to ~/.config/nvim/autoload/llama.vim (Neovim)
" or ~/.vim/autoload/llama.vim (Vim)
"
" start the llama.cpp server with a FIM-compatible model. for example:
"
"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
"
"   --batch-size [512, model max context]
"
"     adjust the batch size to control how much of the provided local context will be used during the inference
"     lower values will use a smaller part of the context around the cursor, which will result in faster processing
"
"   --ubatch-size [64, 2048]
"
"     splits the batch into smaller chunks for faster processing
"     the optimal value depends on the specific hardware. use llama-bench to profile and determine the best size
"
"   --cache-reuse (g:llama_config.n_predict, 1024]
"
"     this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
"     using a non-zero value enables context reuse on the server side which dramatically improves the performance at
"     large contexts. a value of 256 should be good for all cases
"
" run this once to initialise llama.vim:
"
"   :call llama#init()
"
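"   to initialise automatically on startup, you could instead add something like
"   this to your vimrc (an illustrative snippet, not part of the plugin):
"
"     autocmd VimEnter * call llama#init()
"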
" more info: https://github.com/ggml-org/llama.cpp/pull/9787
"

" colors (adjust to your liking)
highlight llama_hl_hint guifg=#ff772f ctermfg=202
highlight llama_hl_info guifg=#77ff2f ctermfg=119

" general parameters:
"
"   endpoint:         llama.cpp server endpoint
"   n_prefix:         number of lines before the cursor location to include in the local prefix
"   n_suffix:         number of lines after the cursor location to include in the local suffix
"   n_predict:        max number of tokens to predict
"   t_max_prompt_ms:  max allotted time for the prompt processing (TODO: not yet supported)
"   t_max_predict_ms: max allotted time for the prediction
"   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
"   auto_fim:         trigger FIM completion automatically on cursor movement
"   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
"
" ring buffer of chunks, accumulated over time upon:
"
"   - completion request
"   - yank
"   - entering a buffer
"   - leaving a buffer
"   - writing a file
"
" parameters for the ring buffer with extra context:
"
"   ring_n_chunks:   max number of chunks to pass as extra context to the server (0 to disable)
"   ring_chunk_size: max size of the chunks (in number of lines)
"                    note: adjust these numbers so that you don't overrun your context
"                    at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
"   ring_scope:      the range around the cursor position (in number of lines) for gathering chunks after FIM
"   ring_update_ms:  how often to process queued chunks in normal mode
"
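" rough context budget with the defaults below: ring_n_chunks x ring_chunk_size = 64 x 64 = 4096
" lines of extra context; the ~32k token estimate above assumes an average of ~8 tokens per
" line, so adjust for the density of your code
"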
let s:default_config = {
    \ 'endpoint':         'http://127.0.0.1:8012/infill',
    \ 'n_prefix':         256,
    \ 'n_suffix':         64,
    \ 'n_predict':        128,
    \ 't_max_prompt_ms':  500,
    \ 't_max_predict_ms': 3000,
    \ 'show_info':        2,
    \ 'auto_fim':         v:true,
    \ 'max_line_suffix':  8,
    \ 'ring_n_chunks':    64,
    \ 'ring_chunk_size':  64,
    \ 'ring_scope':       1024,
    \ 'ring_update_ms':   1000,
    \ }

let g:llama_config = get(g:, 'llama_config', s:default_config)
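
" note: a user-provided g:llama_config replaces s:default_config wholesale (it is
" not merged), so a custom dict must define every key. an illustrative override
" for your vimrc (the tweaked values here are just examples):
"
"   let g:llama_config = {
"       \ 'endpoint':         'http://127.0.0.1:8012/infill',
"       \ 'n_prefix':         256,
"       \ 'n_suffix':         64,
"       \ 'n_predict':        64,
"       \ 't_max_prompt_ms':  500,
"       \ 't_max_predict_ms': 1000,
"       \ 'show_info':        2,
"       \ 'auto_fim':         v:false,
"       \ 'max_line_suffix':  8,
"       \ 'ring_n_chunks':    64,
"       \ 'ring_chunk_size':  64,
"       \ 'ring_scope':       1024,
"       \ 'ring_update_ms':   1000,
"       \ }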

" number of extra columns occupied by the leading tabs of a:str (&tabstop - 1 per tab);
" used to left-pad virtual lines so that their indentation lines up when rendered
function! s:get_indent(str)
    let l:count = 0
    for i in range(len(a:str))
        if a:str[i] == "\t"
            let l:count += &tabstop - 1
        else
            break
        endif
    endfor
    return l:count
endfunction

" random integer in the inclusive range [a:i0, a:i1]
function! s:rand(i0, i1) abort
    return a:i0 + rand() % (a:i1 - a:i0 + 1)
endfunction

function! llama#init()
    if !executable('curl')
        echohl WarningMsg
        echo 'llama.vim requires the "curl" command to be available'
        echohl None
        return
    endif

    let s:pos_x = 0 " cursor position upon start of completion
    let s:pos_y = 0

    let s:line_cur = ''

    let s:line_cur_prefix = ''
    let s:line_cur_suffix = ''

    let s:ring_chunks = [] " current set of chunks used as extra context
    let s:ring_queued = [] " chunks that are queued to be sent for processing
    let s:ring_n_evict = 0

    let s:hint_shown = v:false
    let s:pos_y_pick = -9999 " last y where we picked a chunk
    let s:pos_dx = 0
    let s:content = []
    let s:can_accept = v:false

    let s:timer_fim = -1
    let s:t_fim_start = reltime() " used to measure total FIM time
    let s:t_last_move = reltime() " last time the cursor moved

    let s:current_job = v:null

    " ghost text rendering backend: extmarks on Neovim, text properties on Vim
    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
    let s:ghost_text_vim = has('textprop')

    if s:ghost_text_vim
        let s:hlgroup_hint = 'llama_hl_hint'
        let s:hlgroup_info = 'llama_hl_info'

        if empty(prop_type_get(s:hlgroup_hint))
            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
        endif
        if empty(prop_type_get(s:hlgroup_info))
            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
        endif
    endif

    augroup llama
        autocmd!
        autocmd InsertEnter    * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
        autocmd InsertLeavePre * call llama#fim_cancel()

        autocmd CursorMoved     * call s:on_move()
        autocmd CursorMovedI    * call s:on_move()
        autocmd CompleteChanged * call llama#fim_cancel()

        if g:llama_config.auto_fim
            autocmd CursorMovedI * call llama#fim(v:true)
        endif

        " gather chunks upon yanking
        autocmd TextYankPost * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif

        " gather chunks upon entering/leaving a buffer
        autocmd BufEnter * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
        autocmd BufLeave * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)

        " gather chunks upon saving the file
        autocmd BufWritePost * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
    augroup END

    silent! call llama#fim_cancel()

    " init background update of the ring buffer
    if g:llama_config.ring_n_chunks > 0
        call s:ring_update()
    endif
endfunction

" compute how similar two chunks of text are
" 0 - no similarity, 1 - high similarity
" TODO: figure out something better
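"
" e.g. s:chunk_sim(['a', 'b'], ['b', 'c']) -> 2 * 1 / (2 + 2) = 0.5
" (one line in common, four lines in total)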
function! s:chunk_sim(c0, c1)
    let l:lines0 = len(a:c0)
    let l:lines1 = len(a:c1)

    let l:common = 0

    for l:line0 in a:c0
        for l:line1 in a:c1
            if l:line0 == l:line1
                let l:common += 1
                break
            endif
        endfor
    endfor

    return 2.0 * l:common / (l:lines0 + l:lines1)
endfunction

" pick a random chunk of up to g:llama_config.ring_chunk_size lines from the provided text and queue it for processing
"
"   no_mod   - do not pick chunks from buffers with pending changes
"   do_evict - evict chunks that are very similar to the new one
"
function! s:pick_chunk(text, no_mod, do_evict)
    " do not pick chunks from buffers with pending changes or buffers that are not files
    if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
        return
    endif

    " if the extra context option is disabled - do nothing
    if g:llama_config.ring_n_chunks <= 0
        return
    endif

    " don't pick very small chunks
    if len(a:text) < 3
        return
    endif

    if len(a:text) + 1 < g:llama_config.ring_chunk_size
        let l:chunk = a:text
    else
        let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
        let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])

        let l:chunk = a:text[l:l0:l:l1]
    endif

    let l:chunk_str = join(l:chunk, "\n") . "\n"

    " check if this chunk is already added
    let l:exist = v:false

    for i in range(len(s:ring_chunks))
        if s:ring_chunks[i].data == l:chunk
            let l:exist = v:true
            break
        endif
    endfor

    for i in range(len(s:ring_queued))
        if s:ring_queued[i].data == l:chunk
            let l:exist = v:true
            break
        endif
    endfor

    if l:exist
        return
    endif

    " evict queued chunks that are very similar to the new one
    for i in range(len(s:ring_queued) - 1, 0, -1)
        if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
            if a:do_evict
                call remove(s:ring_queued, i)
                let s:ring_n_evict += 1
            else
                return
            endif
        endif
    endfor

    " also from s:ring_chunks
    for i in range(len(s:ring_chunks) - 1, 0, -1)
        if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
            if a:do_evict
                call remove(s:ring_chunks, i)
                let s:ring_n_evict += 1
            else
                return
            endif
        endif
    endfor

    " TODO: become a parameter?
    if len(s:ring_queued) == 16
        call remove(s:ring_queued, 0)
    endif

    call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})

    "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
endfunction

" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
" called every g:llama_config.ring_update_ms
function! s:ring_update()
    call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})

    " update only if in normal mode or if the cursor hasn't moved for a while
    if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
        return
    endif

    if len(s:ring_queued) == 0
        return
    endif

    " move the first queued chunk to the ring buffer
    if len(s:ring_chunks) == g:llama_config.ring_n_chunks
        call remove(s:ring_chunks, 0)
    endif

    call add(s:ring_chunks, remove(s:ring_queued, 0))

    "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)

    " send asynchronous job with the new extra context so that it is ready for the next FIM
    let l:extra_context = []
    for l:chunk in s:ring_chunks
        call add(l:extra_context, {
            \ 'text':     l:chunk.str,
            \ 'time':     l:chunk.time,
            \ 'filename': l:chunk.filename
            \ })
    endfor

    " no samplers needed here - this is a dummy request whose only purpose is to
    " make the server process (and cache) the new extra context
    let l:request = json_encode({
        \ 'input_prefix':     "",
        \ 'input_suffix':     "",
        \ 'input_extra':      l:extra_context,
        \ 'prompt':           "",
        \ 'n_predict':        1,
        \ 'temperature':      0.0,
        \ 'stream':           v:false,
        \ 'samplers':         ["temperature"],
        \ 'cache_prompt':     v:true,
        \ 't_max_prompt_ms':  1,
        \ 't_max_predict_ms': 1
        \ })

    let l:curl_command = [
        \ "curl",
        \ "--silent",
        \ "--no-buffer",
        \ "--request", "POST",
        \ "--url", g:llama_config.endpoint,
        \ "--header", "Content-Type: application/json",
        \ "--data", l:request
        \ ]

    " no callbacks because we don't need to process the response
    if s:ghost_text_nvim
        call jobstart(l:curl_command, {})
    elseif s:ghost_text_vim
        call job_start(l:curl_command, {})
    endif
endfunction

" necessary for 'inoremap <expr>': the mapping must evaluate to the text to
" insert, so trigger the FIM and return an empty string
function! llama#fim_inline(is_auto) abort
    call llama#fim(a:is_auto)
    return ''
endfunction

" the main FIM call
" takes local context around the cursor and sends it together with the extra context to the server for completion
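"
" local context layout relative to the cursor (assembled below):
"
"   input_prefix - g:llama_config.n_prefix lines above the cursor
"   prompt       - the current line, up to the cursor
"   input_suffix - the rest of the current line, plus g:llama_config.n_suffix lines below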
function! llama#fim(is_auto) abort
    " we already have a suggestion for the current cursor position
    if s:hint_shown && !a:is_auto
        call llama#fim_cancel()
        return
    endif

    call llama#fim_cancel()

    " avoid sending repeated requests too fast
    if reltimefloat(reltime(s:t_fim_start)) < 0.6
        if s:timer_fim != -1
            call timer_stop(s:timer_fim)
            let s:timer_fim = -1
        endif

        let s:t_fim_start = reltime()
        let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
        return
    endif

    let s:t_fim_start = reltime()

    let s:content = []
    let s:can_accept = v:false

    let s:pos_x = col('.') - 1
    let s:pos_y = line('.')
    let l:max_y = line('$')

    let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
    let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))

    let s:line_cur = getline('.')

    let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
    let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)

    if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
        return
    endif

    let l:prefix = ""
        \ . join(l:lines_prefix, "\n")
        \ . "\n"

    let l:prompt = ""
        \ . s:line_cur_prefix

    let l:suffix = ""
        \ . s:line_cur_suffix
        \ . "\n"
        \ . join(l:lines_suffix, "\n")
        \ . "\n"

    " prepare the extra context data
    let l:extra_context = []
    for l:chunk in s:ring_chunks
        call add(l:extra_context, {
            \ 'text':     l:chunk.str,
            \ 'time':     l:chunk.time,
            \ 'filename': l:chunk.filename
            \ })
    endfor

    " the indentation of the current line
    let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))

    let l:request = json_encode({
        \ 'input_prefix':     l:prefix,
        \ 'input_suffix':     l:suffix,
        \ 'input_extra':      l:extra_context,
        \ 'prompt':           l:prompt,
        \ 'n_predict':        g:llama_config.n_predict,
        \ 'n_indent':         l:indent,
        \ 'top_k':            40,
        \ 'top_p':            0.99,
        \ 'stream':           v:false,
        \ 'samplers':         ["top_k", "top_p", "infill"],
        \ 'cache_prompt':     v:true,
        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
        \ 't_max_predict_ms': g:llama_config.t_max_predict_ms
        \ })

    let l:curl_command = [
        \ "curl",
        \ "--silent",
        \ "--no-buffer",
        \ "--request", "POST",
        \ "--url", g:llama_config.endpoint,
        \ "--header", "Content-Type: application/json",
        \ "--data", l:request
        \ ]

    " cancel any in-flight request before sending a new one
    if s:current_job != v:null
        if s:ghost_text_nvim
            call jobstop(s:current_job)
        elseif s:ghost_text_vim
            call job_stop(s:current_job)
        endif
    endif

    " send the request asynchronously
    if s:ghost_text_nvim
        let s:current_job = jobstart(l:curl_command, {
            \ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
            \ 'on_exit':   function('s:fim_on_exit'),
            \ 'stdout_buffered': v:true
            \ })
    elseif s:ghost_text_vim
        let s:current_job = job_start(l:curl_command, {
            \ 'out_cb':  function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
            \ 'exit_cb': function('s:fim_on_exit')
            \ })
    endif

    " TODO: per-file location
    let l:delta_y = abs(s:pos_y - s:pos_y_pick)

    " gather some extra context nearby and process it in the background
    " only gather chunks if the cursor has moved a lot
    " TODO: something more clever? reranking?
    if a:is_auto && l:delta_y > 32
        " expand the prefix even further
        call s:pick_chunk(getline(max([1, s:pos_y - g:llama_config.ring_scope]), max([1, s:pos_y - g:llama_config.n_prefix])), v:false, v:false)

        " pick a suffix chunk
        call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]), min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)

        let s:pos_y_pick = s:pos_y
    endif
endfunction

" if first_line == v:true, accept only the first line of the response
function! llama#fim_accept(first_line)
    " insert the suggestion at the cursor location
    if s:can_accept && len(s:content) > 0
        call setline(s:pos_y, s:line_cur[:(s:pos_x - 1)] . s:content[0])
        if len(s:content) > 1
            if !a:first_line
                call append(s:pos_y, s:content[1:-1])
            endif
        endif

        " move the cursor to the end of the accepted text
        if !a:first_line && len(s:content) > 1
            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
        else
            call cursor(s:pos_y, s:pos_x + len(s:content[0]))
        endif
    endif

    call llama#fim_cancel()
endfunction

function! llama#fim_cancel()
    let s:hint_shown = v:false

    " clear the virtual text
    let l:bufnr = bufnr('%')

    if s:ghost_text_nvim
        let l:id_vt_fim = nvim_create_namespace('vt_fim')
        call nvim_buf_clear_namespace(l:bufnr, l:id_vt_fim, 0, -1)
    elseif s:ghost_text_vim
        call prop_remove({'type': s:hlgroup_hint, 'all': v:true})
        call prop_remove({'type': s:hlgroup_info, 'all': v:true})
    endif

    " remove the mappings
    silent! iunmap <buffer> <Tab>
    silent! iunmap <buffer> <S-Tab>
    silent! iunmap <buffer> <Esc>
endfunction

function! s:on_move()
    let s:t_last_move = reltime()

    call llama#fim_cancel()
endfunction

" callback that processes the FIM result from the server and displays the suggestion
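"
" expected response shape (an abridged sketch; only the fields actually read
" below are listed):
"
"   {
"     "content": "...",
"     "generation_settings": { "n_ctx": ... },
"     "tokens_cached": ...,
"     "truncated": false,
"     "timings": { "prompt_n": ..., "prompt_ms": ..., "prompt_per_second": ...,
"                  "predicted_n": ..., "predicted_ms": ..., "predicted_per_second": ... }
"   }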
function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null)
    if s:ghost_text_nvim
        let l:raw = join(a:data, "\n")
    elseif s:ghost_text_vim
        let l:raw = a:data
    endif

    if len(l:raw) == 0
        return
    endif

    " ignore stale responses - the cursor has moved since the request was sent
    if a:pos_x != col('.') - 1 || a:pos_y != line('.')
        return
    endif

    " show the suggestion only in insert mode
    if mode() !=# 'i'
        return
    endif

    let s:pos_x = a:pos_x
    let s:pos_y = a:pos_y

    let s:can_accept = v:true
    let l:has_info = v:false

    if s:can_accept && v:shell_error
        if !a:is_auto
            call add(s:content, "<| curl error: is the server on? |>")
        endif
        let s:can_accept = v:false
    endif

    let l:n_prompt    = 0
    let l:t_prompt_ms = 1.0
    let l:s_prompt    = 0

    let l:n_predict    = 0
    let l:t_predict_ms = 1.0
    let l:s_predict    = 0

    " get the generated suggestion
    if s:can_accept
        let l:response = json_decode(l:raw)

        for l:part in split(get(l:response, 'content', ''), "\n", 1)
            call add(s:content, l:part)
        endfor

        " remove trailing new lines
        while len(s:content) > 0 && s:content[-1] == ""
            call remove(s:content, -1)
        endwhile

        let l:generation_settings = get(l:response, 'generation_settings', {})
        let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)

        let l:n_cached  = get(l:response, 'tokens_cached', 0)
        let l:truncated = get(l:response, 'truncated', v:false)

        " if response.timings is available
        if len(get(l:response, 'timings', {})) > 0
            let l:has_info = v:true
            let l:timings = get(l:response, 'timings', {})

            let l:n_prompt    = get(l:timings, 'prompt_n', 0)
            let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
            let l:s_prompt    = get(l:timings, 'prompt_per_second', 0)

            let l:n_predict    = get(l:timings, 'predicted_n', 0)
            let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
            let l:s_predict    = get(l:timings, 'predicted_per_second', 0)
        endif
    endif

    if len(s:content) == 0
        call add(s:content, "")
        let s:can_accept = v:false
    endif

    if len(s:content) == 0
        return
    endif

    " NOTE: the following is logic for discarding predictions that repeat existing text
    "       the code is quite ugly and there is very likely a simpler and more canonical way to implement this
    "
    "       still, I wonder if there is some better way that avoids having to do these special hacks?
    "       on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
    "       start generating whatever we have given it via the extra context. but on the other hand, it's not very
    "       helpful to re-generate the same code that is already there

    " truncate the suggestion if the first line is empty
    if len(s:content) == 1 && s:content[0] == ""
        let s:content = [""]
    endif

    " ... and the next lines are repeated
    if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
        let s:content = [""]
    endif

    " truncate the suggestion if it repeats the suffix
    if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
        let s:content = [""]
    endif

    " find the first non-empty line (strip whitespace)
    let l:cmp_y = s:pos_y + 1
    while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
        let l:cmp_y += 1
    endwhile

    if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
        " truncate the suggestion if it repeats the next line
        if len(s:content) == 1
            let s:content = [""]
        endif

        " ... or if the second line of the suggestion is a prefix of line l:cmp_y + 1
        if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
            let s:content = [""]
        endif

        " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
        if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
            let s:content = [""]
        endif
    endif

    " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
    "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
    "for i in range(1, len(s:content) - 1)
    "    if strlen(matchstr(s:content[i], '^\s*')) < l:indent
    "        let s:content = s:content[:i - 1]
    "        break
    "    endif
    "endfor

    let s:pos_dx = len(s:content[-1])

    let s:content[-1] .= s:line_cur_suffix

    call llama#fim_cancel()

    " display virtual text with the suggestion
    let l:bufnr = bufnr('%')

    if s:ghost_text_nvim
        let l:id_vt_fim = nvim_create_namespace('vt_fim')
    endif

    " construct the info message
    if g:llama_config.show_info > 0 && l:has_info
        let l:prefix = '   '

        if l:truncated
            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, l:n_ctx
                \ )
        else
            let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
                \ l:n_prompt, l:t_prompt_ms, l:s_prompt,
                \ l:n_predict, l:t_predict_ms, l:s_predict,
                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
                \ )
        endif

        if g:llama_config.show_info == 1
            " display the info in the statusline
            let &statusline = l:info
            let l:info = ''
        endif
    endif

    " display the suggestion and append the info to the end of the first line
    if s:ghost_text_nvim
        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
            \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
            \ 'virt_text_win_col': virtcol('.') - 1
            \ })

        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
            \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
            \ 'virt_text_win_col': virtcol('.')
            \ })
    elseif s:ghost_text_vim
        let l:new_suffix = s:content[0]
        if !empty(l:new_suffix)
            call prop_add(s:pos_y, s:pos_x + 1, {
                \ 'type': s:hlgroup_hint,
                \ 'text': l:new_suffix
                \ })
        endif
        for line in s:content[1:]
            call prop_add(s:pos_y, 0, {
                \ 'type': s:hlgroup_hint,
                \ 'text': line,
                \ 'text_padding_left': s:get_indent(line),
                \ 'text_align': 'below'
                \ })
        endfor
        if !empty(l:info)
            call prop_add(s:pos_y, 0, {
                \ 'type': s:hlgroup_info,
                \ 'text': l:info,
                \ 'text_padding_left': col('$'),
                \ 'text_wrap': 'truncate'
                \ })
        endif
    endif

    " setup accept shortcuts
    inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>

    let s:hint_shown = v:true
endfunction

function! s:fim_on_exit(job_id, exit_code, event = v:null)
    if a:exit_code != 0
        echom "Job failed with exit code: " . a:exit_code
    endif

    let s:current_job = v:null
endfunction