Mirror of https://github.com/ggml-org/llama.cpp.git

Commit 553a5c3a9f
RPC_CMD_SET_TENSOR always returns an empty response, and we send this command 4 times per token. We can improve text-generation (TG) speed by not waiting for this empty response. The performance impact of this change depends on the network latency.
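For illustration, a minimal sketch of the idea in C. The helper `send_all`, the `rpc_send_set_tensor` wrapper, and the 1-byte-command, length-prefixed wire framing are assumptions for this sketch, not the actual llama.cpp internals; only the fire-and-forget behavior reflects the change described above.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/socket.h>

// Assumed helper: loop until the whole buffer has been written to the socket.
static bool send_all(int sockfd, const void * buf, size_t len) {
    const uint8_t * p = (const uint8_t *) buf;
    while (len > 0) {
        ssize_t n = send(sockfd, p, len, 0);
        if (n <= 0) {
            return false;
        }
        p   += (size_t) n;
        len -= (size_t) n;
    }
    return true;
}

// Hypothetical framing: a 1-byte command id followed by a length-prefixed
// payload. The real protocol may differ; this only illustrates the idea.
static bool rpc_send_set_tensor(int sockfd, uint8_t cmd_set_tensor,
                                const void * payload, size_t payload_size) {
    if (!send_all(sockfd, &cmd_set_tensor, sizeof(cmd_set_tensor))) return false;
    if (!send_all(sockfd, &payload_size,   sizeof(payload_size)))   return false;
    if (!send_all(sockfd, payload,         payload_size))           return false;
    // Previously the client would block on a recv() here for an always-empty
    // reply. Skipping that wait removes one network round trip per
    // SET_TENSOR, i.e. about 4 round trips per generated token.
    return true;
}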
34 lines · 1.0 KiB · C
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef  __cplusplus
extern "C" {
#endif

#define RPC_PROTO_MAJOR_VERSION    2
#define RPC_PROTO_MINOR_VERSION    0
#define RPC_PROTO_PATCH_VERSION    0
#define GGML_RPC_MAX_SERVERS       16

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);

GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
                                                    const char * cache_dir,
                                                    size_t free_mem, size_t total_mem);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);

#ifdef  __cplusplus
}
#endif
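As a quick orientation for the API above, a minimal client sketch using only the functions declared in this header plus `ggml_backend_free` from ggml-backend.h. The endpoint string "127.0.0.1:50052" is a placeholder for wherever the RPC server is listening, and the NULL-on-failure behavior of `ggml_backend_rpc_init` is an assumption of this sketch.

#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-rpc.h"

int main(void) {
    // Connect to a running RPC server; endpoint format is "host:port".
    ggml_backend_t backend = ggml_backend_rpc_init("127.0.0.1:50052");
    if (backend == NULL || !ggml_backend_is_rpc(backend)) {
        fprintf(stderr, "failed to connect to RPC server\n");
        return 1;
    }

    // Query how much memory the remote device reports.
    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory("127.0.0.1:50052", &free_mem, &total_mem);
    printf("remote device memory: %zu free / %zu total bytes\n", free_mem, total_mem);

    ggml_backend_free(backend);
    return 0;
}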