mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	server : allow to generate multimodal embeddings (#4681)
This commit is contained in:
Karthik Sethuraman (committed by GitHub)

parent 82d6eab224
commit b93edd22f5
@@ -166,7 +166,7 @@ node index.js
 
     `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
 
-    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
 
     *Result JSON:*
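As an illustration of the prompt placeholder documented in the hunk above, here is a minimal client sketch. It assumes a llama.cpp server with a multimodal (e.g. LLaVA) model listening on `localhost:8080`; the file name `house.jpg` is a hypothetical placeholder.

```python
# Sketch of a multimodal /completion request, assuming a llama.cpp
# server with a LLaVA-style model on localhost:8080 and a local
# placeholder image file house.jpg.
import base64
import json
import urllib.request

with open("house.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "prompt": "USER:[img-12]Describe the image in detail.\nASSISTANT:",
    "image_data": [{"data": img_b64, "id": 12}],  # [img-12] resolves to id 12
    "n_predict": 128,
}
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["content"])  # generated description of the image
```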
@@ -224,6 +224,8 @@ node index.js
 
     `content`: Set the text to process.
 
+    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+
 -   **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as a stream.
 
     *Options:*
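The new `/embedding` documentation added above can be exercised the same way; a minimal sketch under the same assumptions (local server on port 8080, multimodal model loaded, hypothetical image file):

```python
# Sketch of a multimodal /embedding request (same assumptions as above:
# llama.cpp server with a LLaVA-style model on localhost:8080).
import base64
import json
import urllib.request

with open("house.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "content": "Image: [img-21].\nCaption: This is a picture of a house",
    "image_data": [{"data": img_b64, "id": 21}],  # [img-21] resolves to id 21
}
req = urllib.request.Request(
    "http://localhost:8080/embedding",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    embedding = json.load(resp)["embedding"]  # multimodal embedding vector
    print(len(embedding))
```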
@@ -3077,7 +3077,17 @@ int main(int argc, char **argv)
                 {
                     prompt = "";
                 }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
+
+                json image_data;
+                if (body.count("image_data") != 0) {
+                    image_data = body["image_data"];
+                }
+                else
+                {
+                    image_data = "";
+                }
+
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
                 task_result result = llama.next_result(task_id);
                 return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
             });
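Note that the handler above falls back to an empty `image_data` value when the field is absent, so plain text embeddings are unaffected. A quick text-only check, under the same assumptions as the sketches above:

```python
# Text-only /embedding request; the server-side change above defaults
# image_data when the field is missing, so this path is unchanged.
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/embedding",
    data=json.dumps({"content": "Hello, world"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(len(json.load(resp)["embedding"]))  # embedding dimension
```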