1" LLM-based text completion using llama.cpp
  2"
  3" requires:
  4"
  5"   - neovim or vim
  6"   - curl
  7"   - llama.cpp server instance
  8"   - FIM-compatible model
  9"
 10" sample config:
 11"
 12"   - Tab       - accept the current suggestion
 13"   - Shift+Tab - accept just the first line of the suggestion
 14"   - Ctrl+F    - toggle FIM completion manually
 15"
 16" make symlink or copy this file to ~/.config/nvim/autoload/llama.vim
 17"
 18" start the llama.cpp server with a FIM-compatible model. for example:
 19"
 20"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
 21"
 22"   --batch-size [512, model max context]
 23"
 24"     adjust the batch size to control how much of the provided local context will be used during the inference
 25"     lower values will use smaller part of the context around the cursor, which will result in faster processing
 26"
 27"   --ubatch-size [64, 2048]
 28"
 29"     chunks the batch into smaller chunks for faster processing
 30"     depends on the specific hardware. use llama-bench to profile and determine the best size
 31"
"   --cache-reuse (g:llama_config.n_predict, 1024]
 33"
 34"     this should be either 0 (disabled) or strictly larger than g:llama_config.n_predict
 35"     using non-zero value enables context reuse on the server side which dramatically improves the performance at
 36"     large contexts. a value of 256 should be good for all cases
 37"
 38" run this once to initialise llama.vim:
 39"
 40"   :call llama#init()
 41"
 42" more info: https://github.com/ggml-org/llama.cpp/pull/9787
 43"
 44
 45" colors (adjust to your liking)
 46highlight llama_hl_hint guifg=#ff772f ctermfg=202
 47highlight llama_hl_info guifg=#77ff2f ctermfg=119
 48
 49" general parameters:
 50"
 51"   endpoint:         llama.cpp server endpoint
 52"   n_prefix:         number of lines before the cursor location to include in the local prefix
 53"   n_suffix:         number of lines after  the cursor location to include in the local suffix
 54"   n_predict:        max number of tokens to predict
"   t_max_prompt_ms:  max allotted time for the prompt processing (TODO: not yet supported)
"   t_max_predict_ms: max allotted time for the prediction
 57"   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
 58"   auto_fim:         trigger FIM completion automatically on cursor movement
 59"   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
 60"
 61" ring buffer of chunks, accumulated with time upon:
 62"
 63"  - completion request
 64"  - yank
 65"  - entering a buffer
 66"  - leaving a buffer
 67"  - writing a file
 68"
 69" parameters for the ring-buffer with extra context:
 70"
 71"   ring_n_chunks:    max number of chunks to pass as extra context to the server (0 to disable)
 72"   ring_chunk_size:  max size of the chunks (in number of lines)
 73"                     note: adjust these numbers so that you don't overrun your context
 74"                           at ring_n_chunks = 64 and ring_chunk_size = 64 you need ~32k context
 75"   ring_scope:       the range around the cursor position (in number of lines) for gathering chunks after FIM
 76"   ring_update_ms:   how often to process queued chunks in normal mode
 77"
 78let s:default_config = {
 79    \ 'endpoint':         'http://127.0.0.1:8012/infill',
 80    \ 'n_prefix':         256,
 81    \ 'n_suffix':         64,
 82    \ 'n_predict':        128,
 83    \ 't_max_prompt_ms':  500,
 84    \ 't_max_predict_ms': 3000,
 85    \ 'show_info':        2,
 86    \ 'auto_fim':         v:true,
 87    \ 'max_line_suffix':  8,
 88    \ 'ring_n_chunks':    64,
 89    \ 'ring_chunk_size':  64,
 90    \ 'ring_scope':       1024,
 91    \ 'ring_update_ms':   1000,
 92    \ }
 93
 94let g:llama_config = get(g:, 'llama_config', s:default_config)
 95
 96function! s:get_indent(str)
 97    let l:count = 0
 98    for i in range(len(a:str))
 99        if a:str[i] == "\t"
100            let l:count += &tabstop - 1
101        else
102            break
103        endif
104    endfor
105    return l:count
106endfunction
107
" uniformly-distributed random integer in the inclusive range [i0, i1]
function! s:rand(i0, i1) abort
    let l:span = a:i1 - a:i0 + 1
    return a:i0 + rand() % l:span
endfunction
111
" initialise the plugin:
"   - verify that curl is available (all server communication goes through it)
"   - reset all script-local state
"   - select the ghost-text backend (neovim extmarks vs vim text properties)
"   - install the autocommands that drive completion and context gathering
"   - start the background ring-buffer update loop
function! llama#init()
    if !executable('curl')
        echohl WarningMsg
        echo 'llama.vim requires the "curl" command to be available'
        echohl None
        return
    endif

    let s:pos_x = 0 " cursor position upon start of completion
    let s:pos_y = 0

    let s:line_cur = ''

    let s:line_cur_prefix = ''
    let s:line_cur_suffix = ''

    let s:ring_chunks = [] " current set of chunks used as extra context
    let s:ring_queued = [] " chunks that are queued to be sent for processing
    let s:ring_n_evict = 0

    let s:hint_shown = v:false
    let s:pos_y_pick = -9999 " last y where we picked a chunk
    let s:pos_dx = 0
    let s:content = []
    let s:can_accept = v:false

    let s:timer_fim = -1
    let s:t_fim_start = reltime() " used to measure total FIM time
    let s:t_last_move = reltime() " last time the cursor moved

    let s:current_job = v:null

    " backend detection: neovim exposes nvim_* API functions, vim has textprops
    let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
    let s:ghost_text_vim = has('textprop')

    if s:ghost_text_vim
        let s:hlgroup_hint = 'llama_hl_hint'
        let s:hlgroup_info = 'llama_hl_info'

        " register the text property types once (prop_type_get() returns {} when absent)
        if empty(prop_type_get(s:hlgroup_hint))
            call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
        endif
        if empty(prop_type_get(s:hlgroup_info))
            call prop_type_add(s:hlgroup_info, {'highlight': s:hlgroup_info})
        endif
    endif

    augroup llama
        autocmd!
        autocmd InsertEnter     * inoremap <expr> <silent> <C-F> llama#fim_inline(v:false)
        autocmd InsertLeavePre  * call llama#fim_cancel()

        autocmd CursorMoved     * call s:on_move()
        autocmd CursorMovedI    * call s:on_move()
        autocmd CompleteChanged * call llama#fim_cancel()

        if g:llama_config.auto_fim
            autocmd CursorMovedI * call llama#fim(v:true)
        endif

        " gather chunks upon yanking
        autocmd TextYankPost    * if v:event.operator ==# 'y' | call s:pick_chunk(v:event.regcontents, v:false, v:true) | endif

        " gather chunks upon entering/leaving a buffer
        " NOTE(review): BufEnter is deferred by 100ms via timer_start, presumably so
        " that line('.') reflects the restored cursor position - confirm
        autocmd BufEnter        * call timer_start(100, {-> s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)})
        autocmd BufLeave        * call                      s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)

        " gather chunk upon saving the file
        autocmd BufWritePost    * call s:pick_chunk(getline(max([1, line('.') - g:llama_config.ring_chunk_size/2]), min([line('.') + g:llama_config.ring_chunk_size/2, line('$')])), v:true, v:true)
    augroup END

    silent! call llama#fim_cancel()

    " init background update of the ring buffer
    if g:llama_config.ring_n_chunks > 0
        call s:ring_update()
    endif
endfunction
190
191" compute how similar two chunks of text are
192" 0 - no similarity, 1 - high similarity
193" TODO: figure out something better
" compute how similar two chunks of text are
" 0 - no similarity, 1 - high similarity
" TODO: figure out something better
"
" counts the lines of c0 that also appear verbatim in c1 and normalizes by the
" total line count (a Dice-style coefficient)
function! s:chunk_sim(c0, c1)
    let l:lines0 = len(a:c0)
    let l:lines1 = len(a:c1)

    " guard against division by zero when both chunks are empty
    if l:lines0 + l:lines1 == 0
        return 0.0
    endif

    let l:common = 0

    for l:line0 in a:c0
        for l:line1 in a:c1
            if l:line0 == l:line1
                let l:common += 1
                break
            endif
        endfor
    endfor

    return 2.0 * l:common / (l:lines0 + l:lines1)
endfunction
211
212" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
213"
214" no_mod   - do not pick chunks from buffers with pending changes
215" do_evict - evict chunks that are very similar to the new one
216"
" pick a random chunk of size g:llama_config.ring_chunk_size from the provided text and queue it for processing
"
" text     - list of lines to sample a chunk from
" no_mod   - do not pick chunks from buffers with pending changes
" do_evict - evict chunks that are very similar to the new one
"
function! s:pick_chunk(text, no_mod, do_evict)
    " do not pick chunks from buffers with pending changes or buffers that are not files
    if a:no_mod && (getbufvar(bufnr('%'), '&modified') || !buflisted(bufnr('%')) || !filereadable(expand('%')))
        return
    endif

    " if the extra context option is disabled - do nothing
    if g:llama_config.ring_n_chunks <= 0
        return
    endif

    " don't pick very small chunks
    if len(a:text) < 3
        return
    endif

    " take the whole text when it is small enough, otherwise cut a random
    " window of roughly ring_chunk_size/2 lines out of it
    if len(a:text) + 1 < g:llama_config.ring_chunk_size
        let l:chunk = a:text
    else
        let l:l0 = s:rand(0, max([0, len(a:text) - g:llama_config.ring_chunk_size/2]))
        let l:l1 = min([l:l0 + g:llama_config.ring_chunk_size/2, len(a:text)])

        let l:chunk = a:text[l:l0:l:l1]
    endif

    let l:chunk_str = join(l:chunk, "\n") . "\n"

    " check if this chunk is already added
    let l:exist = v:false

    for i in range(len(s:ring_chunks))
        if s:ring_chunks[i].data == l:chunk
            let l:exist = v:true
            break
        endif
    endfor

    for i in range(len(s:ring_queued))
        if s:ring_queued[i].data == l:chunk
            let l:exist = v:true
            break
        endif
    endfor

    if l:exist
        return
    endif

    " evict queued chunks that are very similar to the new one
    " (iterate backwards so remove() does not shift the indices still to visit)
    for i in range(len(s:ring_queued) - 1, 0, -1)
        if s:chunk_sim(s:ring_queued[i].data, l:chunk) > 0.9
            if a:do_evict
                call remove(s:ring_queued, i)
                let s:ring_n_evict += 1
            else
                " eviction disabled: keep the existing similar chunk, drop the new one
                return
            endif
        endif
    endfor

    " also from s:ring_chunks
    for i in range(len(s:ring_chunks) - 1, 0, -1)
        if s:chunk_sim(s:ring_chunks[i].data, l:chunk) > 0.9
            if a:do_evict
                call remove(s:ring_chunks, i)
                let s:ring_n_evict += 1
            else
                return
            endif
        endif
    endfor

    " cap the queue size: drop the oldest queued chunk when full
    " TODO: become parameter ?
    if len(s:ring_queued) == 16
        call remove(s:ring_queued, 0)
    endif

    call add(s:ring_queued, {'data': l:chunk, 'str': l:chunk_str, 'time': reltime(), 'filename': expand('%')})

    "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
endfunction
298
299" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
300" called every g:llama_config.ring_update_ms
" picks a queued chunk, sends it for processing and adds it to s:ring_chunks
" called every g:llama_config.ring_update_ms
function! s:ring_update()
    " re-arm the timer first so the loop keeps running even when we bail out below
    call timer_start(g:llama_config.ring_update_ms, {-> s:ring_update()})

    " update only if in normal mode or if the cursor hasn't moved for a while
    if mode() !=# 'n' && reltimefloat(reltime(s:t_last_move)) < 3.0
        return
    endif

    if len(s:ring_queued) == 0
        return
    endif

    " move the first queued chunk to the ring buffer
    if len(s:ring_chunks) == g:llama_config.ring_n_chunks
        call remove(s:ring_chunks, 0)
    endif

    call add(s:ring_chunks, remove(s:ring_queued, 0))

    "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)

    " send asynchronous job with the new extra context so that it is ready for the next FIM
    let l:extra_context = []
    for l:chunk in s:ring_chunks
        call add(l:extra_context, {
            \ 'text':     l:chunk.str,
            \ 'time':     l:chunk.time,
            \ 'filename': l:chunk.filename
            \ })
    endfor

    " no samplers needed here
    " (n_predict = 1 and tiny time budgets: the goal is only to warm the
    " server-side prompt cache with the new context, not to generate text)
    let l:request = json_encode({
        \ 'input_prefix':     "",
        \ 'input_suffix':     "",
        \ 'input_extra':      l:extra_context,
        \ 'prompt':           "",
        \ 'n_predict':        1,
        \ 'temperature':      0.0,
        \ 'stream':           v:false,
        \ 'samplers':         ["temperature"],
        \ 'cache_prompt':     v:true,
        \ 't_max_prompt_ms':  1,
        \ 't_max_predict_ms': 1
        \ })

    let l:curl_command = [
        \ "curl",
        \ "--silent",
        \ "--no-buffer",
        \ "--request", "POST",
        \ "--url", g:llama_config.endpoint,
        \ "--header", "Content-Type: application/json",
        \ "--data", l:request
        \ ]

    " no callbacks because we don't need to process the response
    if s:ghost_text_nvim
        call jobstart(l:curl_command, {})
    elseif s:ghost_text_vim
        call job_start(l:curl_command, {})
    endif
endfunction
364
365" necessary for 'inoremap <expr>'
" necessary for 'inoremap <expr>':
" kicks off a FIM request and returns an empty string, so the <expr>
" mapping itself inserts no text
function! llama#fim_inline(is_auto) abort
    call llama#fim(a:is_auto)

    " an <expr> mapping must evaluate to the text to insert
    return ''
endfunction
370
371" the main FIM call
372" takes local context around the cursor and sends it together with the extra context to the server for completion
" the main FIM call
" takes local context around the cursor and sends it together with the extra context to the server for completion
"
" is_auto - v:true when triggered by cursor movement, v:false for a manual trigger (Ctrl+F)
function! llama#fim(is_auto) abort
    " we already have a suggestion for the current cursor position
    " (a second manual trigger dismisses it instead of re-requesting)
    if s:hint_shown && !a:is_auto
        call llama#fim_cancel()
        return
    endif

    call llama#fim_cancel()

    " avoid sending repeated requests too fast
    " (debounce: cancel any pending retry and schedule a single auto request in 600ms)
    if reltimefloat(reltime(s:t_fim_start)) < 0.6
        if s:timer_fim != -1
            call timer_stop(s:timer_fim)
            let s:timer_fim = -1
        endif

        let s:t_fim_start = reltime()
        let s:timer_fim = timer_start(600, {-> llama#fim(v:true)})
        return
    endif

    let s:t_fim_start = reltime()

    let s:content = []
    let s:can_accept = v:false

    " capture the cursor position: 0-based column, 1-based line
    let s:pos_x = col('.') - 1
    let s:pos_y = line('.')
    let l:max_y = line('$')

    " local context: up to n_prefix lines above and n_suffix lines below the cursor
    let l:lines_prefix = getline(max([1, s:pos_y - g:llama_config.n_prefix]), s:pos_y - 1)
    let l:lines_suffix = getline(s:pos_y + 1, min([l:max_y, s:pos_y + g:llama_config.n_suffix]))

    let s:line_cur = getline('.')

    " split the current line at the cursor
    let s:line_cur_prefix = strpart(s:line_cur, 0, s:pos_x)
    let s:line_cur_suffix = strpart(s:line_cur, s:pos_x)

    " don't auto-trigger in the middle of a long line (see max_line_suffix)
    if a:is_auto && len(s:line_cur_suffix) > g:llama_config.max_line_suffix
        return
    endif

    let l:prefix = ""
        \ . join(l:lines_prefix, "\n")
        \ . "\n"

    let l:prompt = ""
        \ . s:line_cur_prefix

    let l:suffix = ""
        \ . s:line_cur_suffix
        \ . "\n"
        \ . join(l:lines_suffix, "\n")
        \ . "\n"

    " prepare the extra context data
    let l:extra_context = []
    for l:chunk in s:ring_chunks
        call add(l:extra_context, {
            \ 'text':     l:chunk.str,
            \ 'time':     l:chunk.time,
            \ 'filename': l:chunk.filename
            \ })
    endfor

    " the indentation of the current line
    let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))

    let l:request = json_encode({
        \ 'input_prefix':     l:prefix,
        \ 'input_suffix':     l:suffix,
        \ 'input_extra':      l:extra_context,
        \ 'prompt':           l:prompt,
        \ 'n_predict':        g:llama_config.n_predict,
        \ 'n_indent':         l:indent,
        \ 'top_k':            40,
        \ 'top_p':            0.99,
        \ 'stream':           v:false,
        \ 'samplers':         ["top_k", "top_p", "infill"],
        \ 'cache_prompt':     v:true,
        \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
        \ 't_max_predict_ms': g:llama_config.t_max_predict_ms
        \ })

    let l:curl_command = [
        \ "curl",
        \ "--silent",
        \ "--no-buffer",
        \ "--request", "POST",
        \ "--url", g:llama_config.endpoint,
        \ "--header", "Content-Type: application/json",
        \ "--data", l:request
        \ ]

    " only one request in flight at a time - cancel any previous job
    if s:current_job != v:null
        if s:ghost_text_nvim
            call jobstop(s:current_job)
        elseif s:ghost_text_vim
            call job_stop(s:current_job)
        endif
    endif

    " send the request asynchronously
    " (the cursor position is bound into the callback so stale responses can be discarded)
    if s:ghost_text_nvim
        let s:current_job = jobstart(l:curl_command, {
            \ 'on_stdout': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
            \ 'on_exit':   function('s:fim_on_exit'),
            \ 'stdout_buffered': v:true
            \ })
    elseif s:ghost_text_vim
        let s:current_job = job_start(l:curl_command, {
            \ 'out_cb': function('s:fim_on_stdout', [s:pos_x, s:pos_y, a:is_auto]),
            \ 'exit_cb':   function('s:fim_on_exit')
            \ })
    endif

    " TODO: per-file location
    let l:delta_y = abs(s:pos_y - s:pos_y_pick)

    " gather some extra context nearby and process it in the background
    " only gather chunks if the cursor has moved a lot
    " TODO: something more clever? reranking?
    if a:is_auto && l:delta_y > 32
        " expand the prefix even further
        call s:pick_chunk(getline(max([1,       s:pos_y - g:llama_config.ring_scope]), max([1,       s:pos_y - g:llama_config.n_prefix])), v:false, v:false)

        " pick a suffix chunk
        call s:pick_chunk(getline(min([l:max_y, s:pos_y + g:llama_config.n_suffix]),   min([l:max_y, s:pos_y + g:llama_config.n_suffix + g:llama_config.ring_chunk_size])), v:false, v:false)

        let s:pos_y_pick = s:pos_y
    endif
endfunction
505
506" if first_line == v:true accept only the first line of the response
" accept the currently displayed suggestion at the stored cursor position
" if first_line == v:true accept only the first line of the response
function! llama#fim_accept(first_line)
    " insert the suggestion at the cursor location
    if s:can_accept && len(s:content) > 0
        " use strpart() rather than s:line_cur[:(s:pos_x - 1)]: when the cursor
        " is at the start of the line s:pos_x is 0, and the slice [:-1] returns
        " the WHOLE string in vimscript (negative indices count from the end),
        " which duplicated the current line's text in front of the suggestion
        call setline(s:pos_y, strpart(s:line_cur, 0, s:pos_x) . s:content[0])
        if len(s:content) > 1
            if !a:first_line
                call append(s:pos_y, s:content[1:-1])
            endif
        endif

        " move the cursor to the end of the accepted text
        if !a:first_line && len(s:content) > 1
            call cursor(s:pos_y + len(s:content) - 1, s:pos_x + s:pos_dx + 1)
        else
            call cursor(s:pos_y, s:pos_x + len(s:content[0]))
        endif
    endif

    call llama#fim_cancel()
endfunction
527
" hide the current suggestion (if any): clears the ghost text for the
" active buffer and removes the buffer-local accept mappings
function! llama#fim_cancel()
    let s:hint_shown = v:false

    " wipe the virtual text in the current buffer
    let l:cur_buf = bufnr('%')

    if s:ghost_text_nvim
        let l:ns_id = nvim_create_namespace('vt_fim')
        call nvim_buf_clear_namespace(l:cur_buf, l:ns_id, 0, -1)
    elseif s:ghost_text_vim
        for l:prop_type in [s:hlgroup_hint, s:hlgroup_info]
            call prop_remove({'type': l:prop_type, 'all': v:true})
        endfor
    endif

    " drop the buffer-local accept/dismiss mappings (silently, in case they are not set)
    for l:keys in ['<Tab>', '<S-Tab>', '<Esc>']
        execute 'silent! iunmap <buffer> ' . l:keys
    endfor
endfunction
547
" autocmd callback for any cursor movement: records the time of the move
" (used by s:ring_update to detect idle periods) and hides any visible hint
function! s:on_move()
    let s:t_last_move = reltime()

    call llama#fim_cancel()
endfunction
553
554" callback that processes the FIM result from the server and displays the suggestion
" callback that processes the FIM result from the server and displays the suggestion
"
" pos_x, pos_y - cursor position captured when the request was sent
" is_auto      - whether the request was auto-triggered (suppresses the curl error hint)
" data         - raw job output: a list of lines on neovim, a string on vim
function! s:fim_on_stdout(pos_x, pos_y, is_auto, job_id, data, event = v:null)
    if s:ghost_text_nvim
        let l:raw = join(a:data, "\n")
    elseif s:ghost_text_vim
        let l:raw = a:data
    endif

    if len(l:raw) == 0
        return
    endif

    " discard stale responses: the cursor has moved since the request was sent
    if a:pos_x != col('.') - 1 || a:pos_y != line('.')
        return
    endif

    " show the suggestion only in insert mode
    if mode() !=# 'i'
        return
    endif

    let s:pos_x = a:pos_x
    let s:pos_y = a:pos_y

    let s:can_accept = v:true
    let l:has_info   = v:false

    " NOTE(review): v:shell_error reflects the last synchronous shell command,
    " not this async job - presumably intended as a best-effort curl check; confirm
    if s:can_accept && v:shell_error
        if !a:is_auto
            call add(s:content, "<| curl error: is the server on? |>")
        endif
        let s:can_accept = v:false
    endif

    " timing stats reported by the server (populated below when available)
    let l:n_prompt    = 0
    let l:t_prompt_ms = 1.0
    let l:s_prompt    = 0

    let l:n_predict    = 0
    let l:t_predict_ms = 1.0
    let l:s_predict    = 0

    " get the generated suggestion
    if s:can_accept
        let l:response = json_decode(l:raw)

        " split into lines; the final 1 (keepempty) preserves blank lines
        for l:part in split(get(l:response, 'content', ''), "\n", 1)
            call add(s:content, l:part)
        endfor

        " remove trailing new lines
        while len(s:content) > 0 && s:content[-1] == ""
            call remove(s:content, -1)
        endwhile

        let l:generation_settings = get(l:response, 'generation_settings', {})
        let l:n_ctx = get(l:generation_settings, 'n_ctx', 0)

        let l:n_cached  = get(l:response, 'tokens_cached', 0)
        let l:truncated = get(l:response, 'truncated', v:false)

        " if response.timings is available
        if len(get(l:response, 'timings', {})) > 0
            let l:has_info = v:true
            let l:timings  = get(l:response, 'timings', {})

            let l:n_prompt    = get(l:timings, 'prompt_n', 0)
            let l:t_prompt_ms = get(l:timings, 'prompt_ms', 1)
            let l:s_prompt    = get(l:timings, 'prompt_per_second', 0)

            let l:n_predict    = get(l:timings, 'predicted_n', 0)
            let l:t_predict_ms = get(l:timings, 'predicted_ms', 1)
            let l:s_predict    = get(l:timings, 'predicted_per_second', 0)
        endif
    endif

    " make sure there is always at least one (possibly empty) line to display
    if len(s:content) == 0
        call add(s:content, "")
        let s:can_accept = v:false
    endif

    " NOTE(review): unreachable - s:content always has at least one element here
    if len(s:content) == 0
        return
    endif

    " NOTE: the following is logic for discarding predictions that repeat existing text
    "       the code is quite ugly and there is very likely a simpler and more canonical way to implement this
    "
    "       still, I wonder if there is some better way that avoids having to do these special hacks?
    "       on one hand, the LLM 'sees' the contents of the file before we start editing, so it is normal that it would
    "       start generating whatever we have given it via the extra context. but on the other hand, it's not very
    "       helpful to re-generate the same code that is already there

    " truncate the suggestion if the first line is empty
    if len(s:content) == 1 && s:content[0] == ""
        let s:content = [""]
    endif

    " ... and the next lines are repeated
    if len(s:content) > 1 && s:content[0] == "" && s:content[1:] == getline(s:pos_y + 1, s:pos_y + len(s:content) - 1)
        let s:content = [""]
    endif

    " truncate the suggestion if it repeats the suffix
    if len(s:content) == 1 && s:content[0] == s:line_cur_suffix
        let s:content = [""]
    endif

    " find the first non-empty line (strip whitespace)
    let l:cmp_y = s:pos_y + 1
    while l:cmp_y < line('$') && getline(l:cmp_y) =~? '^\s*$'
        let l:cmp_y += 1
    endwhile

    if (s:line_cur_prefix . s:content[0]) == getline(l:cmp_y)
        " truncate the suggestion if it repeats the next line
        if len(s:content) == 1
            let s:content = [""]
        endif

        " ... or if the second line of the suggestion is the prefix of line l:cmp_y + 1
        if len(s:content) == 2 && s:content[-1] == getline(l:cmp_y + 1)[:len(s:content[-1]) - 1]
            let s:content = [""]
        endif

        " ... or if the middle chunk of lines of the suggestion is the same as [l:cmp_y + 1, l:cmp_y + len(s:content) - 1)
        if len(s:content) > 2 && join(s:content[1:-1], "\n") == join(getline(l:cmp_y + 1, l:cmp_y + len(s:content) - 1), "\n")
            let s:content = [""]
        endif
    endif

    " keep only lines that have the same or larger whitespace prefix as s:line_cur_prefix
    "let l:indent = strlen(matchstr(s:line_cur_prefix, '^\s*'))
    "for i in range(1, len(s:content) - 1)
    "    if strlen(matchstr(s:content[i], '^\s*')) < l:indent
    "        let s:content = s:content[:i - 1]
    "        break
    "    endif
    "endfor

    " remember the length of the last suggested line (used for cursor placement on accept)
    let s:pos_dx = len(s:content[-1])

    " re-attach the rest of the current line after the suggestion
    let s:content[-1] .= s:line_cur_suffix

    call llama#fim_cancel()

    " display virtual text with the suggestion
    let l:bufnr = bufnr('%')

    if s:ghost_text_nvim
        let l:id_vt_fim = nvim_create_namespace('vt_fim')
    endif

    " construct the info message
    " NOTE(review): when show_info == 0 or has_info is false, l:info stays
    " undefined but is referenced below - looks like a latent error path; confirm
    if g:llama_config.show_info > 0 && l:has_info
        let l:prefix = '   '

        if l:truncated
            let l:info = printf("%s | WARNING: the context is full: %d / %d, increase the server context size or reduce g:llama_config.ring_n_chunks",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached, l:n_ctx
                \ )
        else
            let l:info = printf("%s | c: %d / %d, r: %d / %d, e: %d, q: %d / 16 | p: %d (%.2f ms, %.2f t/s) | g: %d (%.2f ms, %.2f t/s) | t: %.2f ms",
                \ g:llama_config.show_info == 2 ? l:prefix : 'llama.vim',
                \ l:n_cached,  l:n_ctx, len(s:ring_chunks), g:llama_config.ring_n_chunks, s:ring_n_evict, len(s:ring_queued),
                \ l:n_prompt,  l:t_prompt_ms,  l:s_prompt,
                \ l:n_predict, l:t_predict_ms, l:s_predict,
                \ 1000.0 * reltimefloat(reltime(s:t_fim_start))
                \ )
        endif

        if g:llama_config.show_info == 1
            " display the info in the statusline
            let &statusline = l:info
            let l:info = ''
        endif
    endif

    " display the suggestion and append the info to the end of the first line
    if s:ghost_text_nvim
        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, s:pos_x - 1, {
            \ 'virt_text': [[s:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
            \ 'virt_text_win_col': virtcol('.') - 1
            \ })

        call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, s:pos_y - 1, 0, {
            \ 'virt_lines': map(s:content[1:], {idx, val -> [[val, 'llama_hl_hint']]}),
            \ 'virt_text_win_col': virtcol('.')
            \ })
    elseif s:ghost_text_vim
        let l:new_suffix = s:content[0]
        if !empty(l:new_suffix)
            call prop_add(s:pos_y, s:pos_x + 1, {
                        \ 'type': s:hlgroup_hint,
                        \ 'text': l:new_suffix
                        \ })
        endif
        for line in s:content[1:]
            call prop_add(s:pos_y, 0, {
                        \ 'type': s:hlgroup_hint,
                        \ 'text': line,
                        \ 'text_padding_left': s:get_indent(line),
                        \ 'text_align': 'below'
                        \ })
        endfor
        if !empty(l:info)
            call prop_add(s:pos_y, 0, {
                        \ 'type': s:hlgroup_info,
                        \ 'text': l:info,
                        \ 'text_padding_left': col('$'),
                        \ 'text_wrap': 'truncate'
                        \ })
        endif
    endif

    " setup accept shortcuts
    inoremap <buffer> <Tab>   <C-O>:call llama#fim_accept(v:false)<CR>
    inoremap <buffer> <S-Tab> <C-O>:call llama#fim_accept(v:true)<CR>

    let s:hint_shown = v:true
endfunction
776
" job-exit callback: clears the in-flight job handle and reports failures
function! s:fim_on_exit(job_id, exit_code, event = v:null)
    " the request is no longer in flight
    let s:current_job = v:null

    if a:exit_code == 0
        return
    endif

    echom "Job failed with exit code: " . a:exit_code
endfunction