const paramDefaults = {
  stream: true,
  n_predict: 500,
  temperature: 0.2,
  stop: ["</s>"]
};

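// Cached generation settings: filled in from the final streamed chunk,
// or fetched lazily from the server's /props endpoint (see llamaModelInfo).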
let generation_settings = null;

// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
//    import { llama } from '/completion.js'
//
//    const request = llama("Tell me a joke", {n_predict: 800})
//    for await (const chunk of request) {
//      document.write(chunk.data.content)
//    }
//
export async function* llama(prompt, params = {}, config = {}) {
  let controller = config.controller;
  const api_url = config.api_url?.replace(/\/+$/, '') || "";

  if (!controller) {
    controller = new AbortController();
  }

  const completionParams = { ...paramDefaults, ...params, prompt };

  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
    method: 'POST',
    body: JSON.stringify(completionParams),
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
      'Accept': 'text/event-stream',
      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });

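  // Read the response as a raw byte stream and decode it incrementally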
  const reader = response.body.getReader();
  const decoder = new TextDecoder();

  let content = "";
  let leftover = ""; // Buffer for partially read lines

  try {
    let cont = true;

    while (cont) {
      const result = await reader.read();
      if (result.done) {
        break;
      }

      // Add any leftover data to the current chunk of data. Decoding in
      // stream mode keeps multi-byte UTF-8 sequences that were split
      // across chunks intact.
      const text = leftover + decoder.decode(result.value, { stream: true });

      // Check if the last character is a line break
      const endsWithLineBreak = text.endsWith('\n');

      // Split the text into lines
      let lines = text.split('\n');

      // If the text doesn't end with a line break, then the last line is incomplete
      // Store it in leftover to be added to the next chunk of data
      if (!endsWithLineBreak) {
        leftover = lines.pop();
      } else {
        leftover = ""; // Reset leftover if we have a line break at the end
      }

      // Parse all SSE events and add their fields to result. No /g flag on
      // the regex: a global regex reused with exec() would carry lastIndex
      // over between lines and silently miss matches.
      const regex = /^(\S+):\s(.*)$/;
      for (const line of lines) {
        const match = regex.exec(line);
        if (match) {
          result[match[1]] = match[2];
          if (result.data === '[DONE]') {
            cont = false;
            break;
          }

          // since we know this is llama.cpp, let's just decode the JSON in data
          if (result.data) {
            result.data = JSON.parse(result.data);
            content += result.data.content;

            // yield
            yield result;

            // if we got a stop token from the server, break here
            if (result.data.stop) {
              if (result.data.generation_settings) {
                generation_settings = result.data.generation_settings;
              }
              cont = false;
              break;
            }
          }
          if (result.error) {
            try {
              result.error = JSON.parse(result.error);
            } catch (e) {
              console.error(`llama.cpp error ${result.error}`);
              continue;
            }
            // JSON.parse lives in its own try block above, so this throw
            // is not swallowed and actually reaches upstream callers
            if (result.error.message?.includes('slot unavailable')) {
              throw new Error('slot unavailable');
            }
            console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
          }
        }
      }
    }
  } catch (e) {
    if (e.name !== 'AbortError') {
      console.error("llama error: ", e);
    }
    throw e;
  } finally {
    controller.abort();
  }

  return content;
}
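
// A generation can be cancelled part-way through by passing your own
// AbortController via config. Note that aborting makes the loop exit with
// an AbortError (a usage sketch; `shouldStop` stands in for your app's own
// cancellation condition):
//
//    const controller = new AbortController()
//    try {
//      for await (const chunk of llama("Tell me a joke", {}, { controller })) {
//        document.write(chunk.data.content)
//        if (shouldStop()) controller.abort()
//      }
//    } catch (e) {
//      if (e.name !== 'AbortError') throw e
//    }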

// Call llama, return an event target that you can subscribe to
//
// Example:
//
//    import { llamaEventTarget } from '/completion.js'
//
//    const conn = llamaEventTarget(prompt)
//    conn.addEventListener("message", (chunk) => {
//      document.write(chunk.detail.content)
//    })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
  const eventTarget = new EventTarget();
  (async () => {
    let content = "";
    for await (const chunk of llama(prompt, params, config)) {
      if (chunk.data) {
        content += chunk.data.content;
        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
        if (chunk.data.generation_settings) {
          eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
        }
        if (chunk.data.timings) {
          eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
        }
      }
    }
    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
  })();
  return eventTarget;
}
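
// Besides "message", the target also emits "done" with the accumulated
// content, plus "generation_settings" and "timings" when the server
// includes them in a chunk:
//
//    conn.addEventListener("done", (e) => {
//      console.log(e.detail.content)
//    })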

// Call llama, return a promise that resolves to the completed text.
// This does not support streaming.
//
// Example:
//
//     llamaPromise(prompt).then((content) => {
//       document.write(content)
//     })
//
//     or
//
//     const content = await llamaPromise(prompt)
//     document.write(content)
//
export const llamaPromise = async (prompt, params = {}, config = {}) => {
  let content = "";
  for await (const chunk of llama(prompt, params, config)) {
    content += chunk.data.content;
  }
  return content;
};

/**
 * (deprecated) Use llama(), llamaEventTarget(), or llamaPromise() instead.
 */
export const llamaComplete = async (params, controller, callback) => {
  for await (const chunk of llama(params.prompt, params, { controller })) {
    callback(chunk);
  }
}

// Get the model info from the server. This is useful for getting the
// context window size, among other settings.
export const llamaModelInfo = async (config = {}) => {
  if (!generation_settings) {
    const api_url = config.api_url?.replace(/\/+$/, '') || "";
    const props = await fetch(`${api_url}/props`).then(r => r.json());
    generation_settings = props.default_generation_settings;
  }
  return generation_settings;
}
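
// Example (assumes the server reports n_ctx in its default generation
// settings, as llama.cpp's /props endpoint does):
//
//    const info = await llamaModelInfo()
//    console.log(`context window: ${info.n_ctx}`)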