#include "log.h" #if defined(_WIN32) int main(int argc, char ** argv) { LOG_ERR("Error: llama-run does not run on Windows (lack of linenoise.cpp).\n"); return 1; } #elif !defined(LLAMA_USE_CURL) int main(int argc, char ** argv) { LOG_ERR("Error: llama-run requires CURL support enabled.\n"); return 1; } #else # include "linenoise.cpp/linenoise.h" # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include # include // Global variables for process management static pid_t server_pid = -1; static std::atomic server_ready{ false }; static std::atomic should_exit{ false }; static std::atomic interrupt_response{ false }; // HTTP client for communicating with llama-server class HttpClient { private: CURL * curl; std::string response_data; std::string base_url; static size_t WriteCallback(void * contents, size_t size, size_t nmemb, std::string * userp) { if (interrupt_response.load()) { return 0; // Stop the transfer } size_t realsize = size * nmemb; userp->append((char *) contents, realsize); return realsize; } public: HttpClient(const std::string & host, int port) { curl = curl_easy_init(); base_url = "http://" + host + ":" + std::to_string(port); } ~HttpClient() { if (curl) { curl_easy_cleanup(curl); } } bool is_server_ready() { if (!curl) { return false; } response_data.clear(); // Try the models endpoint instead of health std::string url = base_url + "/v1/models"; curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_data); curl_easy_setopt(curl, CURLOPT_TIMEOUT, 2L); curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 1L); CURLcode res = curl_easy_perform(curl); return res == CURLE_OK && !response_data.empty(); } std::string chat_completion(const std::string & message) { if (!curl) { return "Error: HTTP client not initialized"; } response_data.clear(); std::string url = base_url + "/v1/chat/completions"; // Create simple JSON request string std::string escaped_message = escape_json_string(message); std::string request_str = R"({"model": "unknown", "messages": [{"role": "user", "content": ")" + escaped_message + R"("}], "stream": false})"; struct curl_slist * headers = nullptr; headers = curl_slist_append(headers, "Content-Type: application/json"); curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_POSTFIELDS, request_str.c_str()); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response_data); curl_easy_setopt(curl, CURLOPT_TIMEOUT, 120L); curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 10L); interrupt_response.store(false); CURLcode res = curl_easy_perform(curl); curl_slist_free_all(headers); if (res != CURLE_OK) { if (res == CURLE_OPERATION_TIMEDOUT) { return "Error: Request timed out"; } else if (res == CURLE_COULDNT_CONNECT) { return "Error: Could not connect to server"; } else { return "Error: " + std::string(curl_easy_strerror(res)); } } if (interrupt_response.load()) { return ""; // Empty response for interrupted requests } // Simple JSON parsing to extract content return extract_content_from_response(response_data); } private: std::string escape_json_string(const std::string & input) { std::string result; for (char c : input) { switch (c) { case '"': result += "\\\""; break; case '\\': result += "\\\\"; break; case 
                case '\n': result += "\\n"; break;
                case '\r': result += "\\r"; break;
                case '\t': result += "\\t"; break;
                default:   result += c; break;
            }
        }
        return result;
    }

    std::string extract_content_from_response(const std::string & response) {
        // Simple extraction of content from JSON response
        // Look for "content":"..." pattern
        size_t content_pos = response.find("\"content\":\"");
        if (content_pos == std::string::npos) {
            return "Error: No content found in response";
        }
        content_pos += 11;  // Skip "content":"

        size_t end_pos = content_pos;
        // Find the end of the content string: a quote terminates it only when
        // preceded by an even number of backslashes (an odd count means the
        // quote itself is escaped, e.g. \" inside \\\" sequences)
        while (end_pos < response.length()) {
            if (response[end_pos] == '"') {
                size_t backslashes = 0;
                while (end_pos > content_pos + backslashes && response[end_pos - backslashes - 1] == '\\') {
                    backslashes++;
                }
                if (backslashes % 2 == 0) {
                    break;
                }
            }
            end_pos++;
        }
        if (end_pos >= response.length()) {
            return "Error: Malformed response";
        }

        std::string content = response.substr(content_pos, end_pos - content_pos);
        return unescape_json_string(content);
    }

    std::string unescape_json_string(const std::string & input) {
        std::string result;
        for (size_t i = 0; i < input.length(); ++i) {
            if (input[i] == '\\' && i + 1 < input.length()) {
                switch (input[i + 1]) {
                    case '"':  result += '"';  i++; break;
                    case '\\': result += '\\'; i++; break;
                    case 'n':  result += '\n'; i++; break;
                    case 'r':  result += '\r'; i++; break;
                    case 't':  result += '\t'; i++; break;
                    default:   result += input[i]; break;
                }
            } else {
                result += input[i];
            }
        }
        return result;
    }
};

// Signal handlers
static void sigint_handler(int sig) {
    (void) sig;
    // Set flag to interrupt response, but don't print here
    interrupt_response.store(true);
}

static int cleanup_and_exit(int exit_code) {
    if (server_pid > 0) {
        if (kill(server_pid, SIGTERM) == -1) {
            LOG_ERR("kill failed");
        }
        if (waitpid(server_pid, nullptr, 0) == -1) {
            LOG_ERR("waitpid failed");
        }
    }
    return exit_code;
}

static void sigterm_handler(int sig) {
    (void) sig;
    should_exit.store(true);
}

// Start llama-server process
static bool start_server(const std::vector<std::string> & args, int port) {
    server_pid = fork();
    if (server_pid == -1) {
        perror("fork failed");
        return false;
    }

    if (server_pid == 0) {
        // Child process - execute llama-server
        std::vector<std::string> server_args_vec;
        server_args_vec.push_back("llama-server");

        // Add custom port
        server_args_vec.push_back("--port");
        server_args_vec.push_back(std::to_string(port));

        // Add all original arguments except the program name
        for (size_t i = 1; i < args.size(); ++i) {
            // Skip any existing --port arguments to avoid conflicts
            if (args[i] == "--port") {
                i++;  // Skip the port value too
                continue;
            }
            server_args_vec.push_back(args[i]);
        }

        // Convert to char* array for execvp
        std::vector<char *> server_args;
        for (const auto & arg : server_args_vec) {
            server_args.push_back(const_cast<char *>(arg.c_str()));
        }
        server_args.push_back(nullptr);

        // Try different paths for llama-server
        std::vector<std::string> server_paths = { "./build/bin/llama-server", "./llama-server", "llama-server" };
        for (const auto & path : server_paths) {
            execvp(path.c_str(), server_args.data());
        }

        perror("Failed to execute llama-server");
        _exit(1);  // _exit: don't flush stdio buffers inherited from the parent
    }

    return true;
}

// Wait for server to be ready, timeout is not excessive as we could be
// downloading a model
static bool wait_for_server(HttpClient & client, int max_wait_seconds = 3000) {
    std::cout << "Starting llama-server..." << std::flush;

    for (int i = 0; i < max_wait_seconds; ++i) {
        if (should_exit.load()) {
            return false;
        }
        if (client.is_server_ready()) {
            std::cout << " ready!\n";
            server_ready.store(true);
            return true;
        }
        std::cout << "." << std::flush;
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }

    std::cout << " timeout!\n";
    return false;
}

// Main interactive loop
static int interactive_loop(HttpClient & client) {
    std::cout << "\nChat with the model (Ctrl-D to quit, Ctrl-C to interrupt response):\n";

    const char * input;
    while ((input = linenoise("> ")) != nullptr && !should_exit.load()) {
        std::string user_input(input);
        linenoiseHistoryAdd(input);
        linenoiseFree(const_cast<char *>(input));  // linenoiseFree expects char*

        if (user_input.empty()) {
            continue;
        }

        // Reset interrupt flag before starting request
        interrupt_response.store(false);
        std::cout << std::flush;

        std::string response = client.chat_completion(user_input);

        if (interrupt_response.load()) {
            std::cout << "\n[Response interrupted - press Ctrl-D to quit]\n";
            interrupt_response.store(false);
        } else {
            std::cout << response << "\n\n";
        }
    }

    return 0;
}

static void print_usage(const char * program_name) {
    std::cout << "Usage: " << program_name << " [server-options]\n";
    std::cout << "\nThis tool starts a llama-server process and provides an interactive chat interface.\n";
    std::cout << "All options except --port are passed through to llama-server.\n";
    std::cout << "\nCommon options:\n";
    std::cout << "  -h,  --help               Show this help\n";
    std::cout << "  -m,  --model FNAME        model path (default: `models/$filename` with filename from `--hf-file`\n";
    std::cout << "                            or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)\n";
    std::cout << "  -hf, -hfr, --hf-repo <user>/<model>[:quant]\n";
    std::cout << "                            Hugging Face model repository; quant is optional, case-insensitive,\n";
    std::cout << "                            defaults to Q4_K_M, or falls back to the first file in the repo if\n";
    std::cout << "                            Q4_K_M doesn't exist.\n";
    std::cout << "                            mmproj is also downloaded automatically if available; to disable, add\n";
    std::cout << "                            --no-mmproj\n";
    std::cout << "                            example: unsloth/phi-4-GGUF:q4_k_m\n";
    std::cout << "                            (default: unused)\n";
    std::cout << "  -c,  --ctx-size N         Context size\n";
    std::cout << "  -n,  --predict N          Number of tokens to predict\n";
    std::cout << "  -t,  --threads N          Number of threads\n";
    std::cout << "\nFor all server options, run: llama-server --help\n";
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        print_usage("llama-run");
        return 1;
    }

    // Check for help
    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[i];
        if (arg == "-h" || arg == "--help") {
            print_usage(argv[0]);
            return 0;
        }
    }

    curl_global_init(CURL_GLOBAL_DEFAULT);

    // Setup signal handlers
    signal(SIGINT, sigint_handler);
    signal(SIGTERM, sigterm_handler);

    // Convert args to vector
    std::vector<std::string> args;
    for (int i = 0; i < argc; ++i) {
        args.push_back(argv[i]);
    }

    // Find a free port (start from 8080 and increment); if none of the 100
    // candidates is free, the final untested port is passed to llama-server,
    // which will report the bind failure itself
    int port = 8080;
    for (int i = 0; i < 100; ++i) {
        // Simple check if port is available by trying to bind
        int sock = socket(AF_INET, SOCK_STREAM, 0);
        if (sock >= 0) {
            struct sockaddr_in addr = {};
            addr.sin_family         = AF_INET;
            addr.sin_port           = htons(port);
            addr.sin_addr.s_addr    = INADDR_ANY;
            if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) == 0) {
                close(sock);
                break;  // Port is available
            }
            close(sock);
        }
        port++;
    }

    // Start server
    if (!start_server(args, port)) {
        std::cerr << "Failed to start llama-server\n";
        return 1;
    }

    // Create HTTP client
    HttpClient client("127.0.0.1", port);

    // Wait for server to be ready
    if (!wait_for_server(client)) {
        std::cerr << "Server failed to start in time\n";
        return cleanup_and_exit(1);
    }

    // Start interactive loop
    int result = interactive_loop(client);

    // Cleanup
    return cleanup_and_exit(result);
}

#endif
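
// Build/run sketch (paths and target names are assumptions; adjust to your
// checkout and build layout):
//
//   cmake -B build -DLLAMA_CURL=ON
//   cmake --build build --target llama-server llama-run
//   ./build/bin/llama-run -hf unsloth/phi-4-GGUF:q4_k_m
//
// llama-run picks the first free port at or above 8080, spawns llama-server
// on it, and chats over the OpenAI-compatible /v1/chat/completions endpoint.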