mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	Merge commit from fork
This commit is contained in:
		| @@ -1,5 +1,9 @@ | |||||||
| ## Overview | ## Overview | ||||||
|  |  | ||||||
|  | > [!IMPORTANT] | ||||||
|  | > This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and | ||||||
|  | > insecure. **Never run the RPC server on an open network or in a sensitive environment!** | ||||||
|  |  | ||||||
| The `rpc-server` allows running a `ggml` backend on a remote host. | The `rpc-server` allows running a `ggml` backend on a remote host. | ||||||
| The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. | The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. | ||||||
| This can be used for distributed LLM inference with `llama.cpp` in the following way: | This can be used for distributed LLM inference with `llama.cpp` in the following way: | ||||||
|   | |||||||
| @@ -16,7 +16,7 @@ | |||||||
| #include <stdio.h> | #include <stdio.h> | ||||||
|  |  | ||||||
| struct rpc_server_params { | struct rpc_server_params { | ||||||
|     std::string host        = "0.0.0.0"; |     std::string host        = "127.0.0.1"; | ||||||
|     int         port        = 50052; |     int         port        = 50052; | ||||||
|     size_t      backend_mem = 0; |     size_t      backend_mem = 0; | ||||||
| }; | }; | ||||||
| @@ -114,6 +114,17 @@ int main(int argc, char * argv[]) { | |||||||
|         fprintf(stderr, "Invalid parameters\n"); |         fprintf(stderr, "Invalid parameters\n"); | ||||||
|         return 1; |         return 1; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     if (params.host != "127.0.0.1") { | ||||||
|  |         fprintf(stderr, "\n"); | ||||||
|  |         fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); | ||||||
|  |         fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str()); | ||||||
|  |         fprintf(stderr, "         Never expose the RPC server to an open network!\n"); | ||||||
|  |         fprintf(stderr, "         This is an experimental feature and is not secure!\n"); | ||||||
|  |         fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); | ||||||
|  |         fprintf(stderr, "\n"); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     ggml_backend_t backend = create_backend(); |     ggml_backend_t backend = create_backend(); | ||||||
|     if (!backend) { |     if (!backend) { | ||||||
|         fprintf(stderr, "Failed to create backend\n"); |         fprintf(stderr, "Failed to create backend\n"); | ||||||
|   | |||||||
| @@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por | |||||||
|         fprintf(stderr, "Failed to set SO_REUSEADDR\n"); |         fprintf(stderr, "Failed to set SO_REUSEADDR\n"); | ||||||
|         return nullptr; |         return nullptr; | ||||||
|     } |     } | ||||||
|  |     if (inet_addr(host) == INADDR_NONE) { | ||||||
|  |         fprintf(stderr, "Invalid host address: %s\n", host); | ||||||
|  |         return nullptr; | ||||||
|  |     } | ||||||
|     struct sockaddr_in serv_addr; |     struct sockaddr_in serv_addr; | ||||||
|     serv_addr.sin_family = AF_INET; |     serv_addr.sin_family = AF_INET; | ||||||
|     serv_addr.sin_addr.s_addr = inet_addr(host); |     serv_addr.sin_addr.s_addr = inet_addr(host); | ||||||
| @@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp | |||||||
|     if (result->buffer && buffers.find(result->buffer) == buffers.end()) { |     if (result->buffer && buffers.find(result->buffer) == buffers.end()) { | ||||||
|         return nullptr; |         return nullptr; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // require that the tensor data does not go beyond the buffer end | ||||||
|  |     uint64_t tensor_size = (uint64_t) ggml_nbytes(result); | ||||||
|  |     uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); | ||||||
|  |     uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); | ||||||
|  |     GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow | ||||||
|  |     GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); | ||||||
|  |  | ||||||
|     result->op = (ggml_op) tensor->op; |     result->op = (ggml_op) tensor->op; | ||||||
|     for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { |     for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { | ||||||
|         result->op_params[i] = tensor->op_params[i]; |         result->op_params[i] = tensor->op_params[i]; | ||||||
| @@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) { | |||||||
|     const rpc_tensor * in_tensor = (const rpc_tensor *)input.data(); |     const rpc_tensor * in_tensor = (const rpc_tensor *)input.data(); | ||||||
|     uint64_t offset; |     uint64_t offset; | ||||||
|     memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset)); |     memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset)); | ||||||
|     size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); |     const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); | ||||||
|  |  | ||||||
|     struct ggml_init_params params { |     struct ggml_init_params params { | ||||||
|         /*.mem_size   =*/ ggml_tensor_overhead(), |         /*.mem_size   =*/ ggml_tensor_overhead(), | ||||||
| @@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) { | |||||||
|         return false; |         return false; | ||||||
|     } |     } | ||||||
|     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); |     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); | ||||||
|  |  | ||||||
|  |     // sanitize tensor->data | ||||||
|  |     { | ||||||
|  |         const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); | ||||||
|  |         const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); | ||||||
|  |  | ||||||
|  |         if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { | ||||||
|  |             GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset); |     const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset); | ||||||
|     ggml_backend_tensor_set(tensor, data, offset, size); |     ggml_backend_tensor_set(tensor, data, offset, size); | ||||||
|     ggml_free(ctx); |     ggml_free(ctx); | ||||||
| @@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint | |||||||
|         return false; |         return false; | ||||||
|     } |     } | ||||||
|     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); |     GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); | ||||||
|  |  | ||||||
|  |     // sanitize tensor->data | ||||||
|  |     { | ||||||
|  |         const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer); | ||||||
|  |         const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); | ||||||
|  |  | ||||||
|  |         if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { | ||||||
|  |             GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     // output serialization format: | data (size bytes) | |     // output serialization format: | data (size bytes) | | ||||||
|     output.resize(size, 0); |     output.resize(size, 0); | ||||||
|     ggml_backend_tensor_get(tensor, output.data(), offset, size); |     ggml_backend_tensor_get(tensor, output.data(), offset, size); | ||||||
|   | |||||||
| @@ -3724,7 +3724,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( | |||||||
|         struct ggml_tensor  * view_src, |         struct ggml_tensor  * view_src, | ||||||
|         size_t                view_offs) { |         size_t                view_offs) { | ||||||
|  |  | ||||||
|     assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); |     GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT); | ||||||
|  |     GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); | ||||||
|  |  | ||||||
|     // find the base tensor and absolute offset |     // find the base tensor and absolute offset | ||||||
|     if (view_src != NULL && view_src->view_src != NULL) { |     if (view_src != NULL && view_src->view_src != NULL) { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov