mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-29 08:41:22 +00:00
server : speed up tests (#15836)
* server : speed up tests * clean up * restore timeout_seconds in some places * flake8 * explicit offline
This commit is contained in:
@@ -26,7 +26,7 @@ from re import RegexFlag
|
||||
import wget
|
||||
|
||||
|
||||
DEFAULT_HTTP_TIMEOUT = 30
|
||||
DEFAULT_HTTP_TIMEOUT = 60
|
||||
|
||||
|
||||
class ServerResponse:
|
||||
@@ -45,6 +45,7 @@ class ServerProcess:
|
||||
model_alias: str = "tinyllama-2"
|
||||
temperature: float = 0.8
|
||||
seed: int = 42
|
||||
offline: bool = False
|
||||
|
||||
# custom options
|
||||
model_alias: str | None = None
|
||||
@@ -118,6 +119,8 @@ class ServerProcess:
|
||||
"--seed",
|
||||
self.seed,
|
||||
]
|
||||
if self.offline:
|
||||
server_args.append("--offline")
|
||||
if self.model_file:
|
||||
server_args.extend(["--model", self.model_file])
|
||||
if self.model_url:
|
||||
@@ -392,6 +395,19 @@ server_instances: Set[ServerProcess] = set()
|
||||
|
||||
|
||||
class ServerPreset:
|
||||
@staticmethod
|
||||
def load_all() -> None:
|
||||
""" Load all server presets to ensure model files are cached. """
|
||||
servers: List[ServerProcess] = [
|
||||
method()
|
||||
for name, method in ServerPreset.__dict__.items()
|
||||
if callable(method) and name != "load_all"
|
||||
]
|
||||
for server in servers:
|
||||
server.offline = False
|
||||
server.start()
|
||||
server.stop()
|
||||
|
||||
@staticmethod
|
||||
def tinyllama2() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
@@ -408,6 +424,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def bert_bge_small() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
@@ -422,6 +439,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def bert_bge_small_with_fa() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
|
||||
server.model_alias = "bert-bge-small"
|
||||
@@ -437,6 +455,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def tinyllama_infill() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
|
||||
server.model_alias = "tinyllama-infill"
|
||||
@@ -451,6 +470,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def stories15m_moe() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/stories15M_MOE"
|
||||
server.model_hf_file = "stories15M_MOE-F16.gguf"
|
||||
server.model_alias = "stories15m-moe"
|
||||
@@ -465,6 +485,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def jina_reranker_tiny() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "jina-reranker-v1-tiny-en/ggml-model-f16.gguf"
|
||||
server.model_alias = "jina-reranker"
|
||||
@@ -478,6 +499,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def tinygemma3() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
# mmproj is already provided by HF registry API
|
||||
server.model_hf_repo = "ggml-org/tinygemma3-GGUF"
|
||||
server.model_hf_file = "tinygemma3-Q8_0.gguf"
|
||||
|
||||
Reference in New Issue
Block a user