mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

// Default completion parameters sent with every request; any field can be
// overridden per call via the `params` argument
const paramDefaults = {
  stream: true,
  n_predict: 500,
  temperature: 0.2,
  stop: ["</s>"]
};

// Cached generation settings, updated after each completed request and used
// by llamaModelInfo() below
let generation_settings = null;


// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
//    import { llama } from '/completion.js'
//
//    const request = llama("Tell me a joke", {n_predict: 800})
//    for await (const chunk of request) {
//      document.write(chunk.data.content)
//    }
//
export async function* llama(prompt, params = {}, config = {}) {
  let controller = config.controller;

  if (!controller) {
    controller = new AbortController();
  }

  const completionParams = { ...paramDefaults, ...params, prompt };

  const response = await fetch("/completion", {
    method: 'POST',
    body: JSON.stringify(completionParams),
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
      'Accept': 'text/event-stream',
      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });

  const reader = response.body.getReader();
  const decoder = new TextDecoder();

  let content = "";
  let leftover = ""; // Buffer for partially read lines

  try {
    let cont = true;

    while (cont) {
      const result = await reader.read();
      if (result.done) {
        break;
      }

      // Add any leftover data to the current chunk of data
      const text = leftover + decoder.decode(result.value);

      // Check if the last character is a line break
      const endsWithLineBreak = text.endsWith('\n');

      // Split the text into lines
      let lines = text.split('\n');

      // If the text doesn't end with a line break, then the last line is incomplete
      // Store it in leftover to be added to the next chunk of data
      if (!endsWithLineBreak) {
        leftover = lines.pop();
      } else {
        leftover = ""; // Reset leftover if we have a line break at the end
      }

      // Parse all SSE events and add them to result
      for (const line of lines) {
        // Use a fresh, non-global regex per line so exec() does not carry
        // lastIndex over between calls on different strings
        const match = /^(\S+):\s(.*)$/.exec(line);
        if (match) {
          result[match[1]] = match[2];
          // since we know this is llama.cpp, let's just decode the json in data
          if (result.data) {
            result.data = JSON.parse(result.data);
            content += result.data.content;

            // yield
            yield result;

            // if we got a stop token from server, we will break here
            if (result.data.stop) {
              if (result.data.generation_settings) {
                generation_settings = result.data.generation_settings;
              }
              cont = false;
              break;
            }
          }
          if (result.error) {
            result.error = JSON.parse(result.error);
            if (result.error.content.includes('slot unavailable')) {
              // Throw an error to be caught by upstream callers
              throw new Error('slot unavailable');
            } else {
              console.error(`llama.cpp error: ${result.error.content}`);
            }
          }
        }
      }
    }
  } catch (e) {
    if (e.name !== 'AbortError') {
      console.error("llama error: ", e);
    }
    throw e;
  } finally {
    controller.abort();
  }

  return content;
}
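
// The stream can be cancelled before completion by passing your own
// AbortController via `config` (a minimal sketch; the prompt and the 5s
// timeout are illustrative values):
//
//    const controller = new AbortController();
//    setTimeout(() => controller.abort(), 5000); // stop generation after 5s
//    try {
//      for await (const chunk of llama("Tell me a joke", {}, { controller })) {
//        document.write(chunk.data.content)
//      }
//    } catch (e) {
//      if (e.name !== 'AbortError') throw e; // an abort surfaces as AbortError
//    }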

// Call llama, return an event target that you can subscribe to
//
// Example:
//
//    import { llamaEventTarget } from '/completion.js'
//
//    const conn = llamaEventTarget(prompt)
//    conn.addEventListener("message", (chunk) => {
//      document.write(chunk.detail.content)
//    })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
  const eventTarget = new EventTarget();
  (async () => {
    let content = "";
    for await (const chunk of llama(prompt, params, config)) {
      if (chunk.data) {
        content += chunk.data.content;
        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
      }
      if (chunk.data.generation_settings) {
        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
      }
      if (chunk.data.timings) {
        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
      }
    }
    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
  })();
  return eventTarget;
}
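
// Besides "message", the target also emits "generation_settings", "timings"
// and a final "done" event carrying the full completion (a minimal sketch,
// reusing `conn` from the example above):
//
//    conn.addEventListener("done", (e) => {
//      console.log("completed text:", e.detail.content)
//    })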

// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
//     llamaPromise(prompt).then((content) => {
//       document.write(content)
//     })
//
//     or
//
//     const content = await llamaPromise(prompt)
//     document.write(content)
//
export const llamaPromise = async (prompt, params = {}, config = {}) => {
  let content = "";
  for await (const chunk of llama(prompt, params, config)) {
    content += chunk.data.content;
  }
  return content;
};
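
// Per-request parameters, including an optional `api_key` that is sent as a
// Bearer token, pass straight through `params` (a minimal sketch; the key and
// temperature values are illustrative, and the key is only needed if the
// server requires one):
//
//    const content = await llamaPromise("Tell me a joke", {
//      temperature: 0.7,
//      api_key: "<your key>"
//    })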

/**
 * (deprecated) Iterate the llama() generator directly instead.
 */
export const llamaComplete = async (params, controller, callback) => {
  for await (const chunk of llama(params.prompt, params, { controller })) {
    callback(chunk);
  }
}

// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async () => {
  if (!generation_settings) {
    const props = await fetch("/props").then(r => r.json());
    generation_settings = props.default_generation_settings;
  }
  return generation_settings;
}

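// Example (a sketch; `n_ctx` is assumed to be one of the fields the server
// reports in default_generation_settings):
//
//    const settings = await llamaModelInfo()
//    console.log("context window:", settings.n_ctx)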