mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	ggml : automatic selection of best CPU backend (#10606)
* ggml : automatic selection of best CPU backend * amx : minor opt * add GGML_AVX_VNNI to enable avx-vnni, fix checks
This commit is contained in:
		| @@ -3,22 +3,34 @@ ARG UBUNTU_VERSION=22.04 | |||||||
| FROM ubuntu:$UBUNTU_VERSION AS build | FROM ubuntu:$UBUNTU_VERSION AS build | ||||||
|  |  | ||||||
| RUN apt-get update && \ | RUN apt-get update && \ | ||||||
|     apt-get install -y build-essential git libcurl4-openssl-dev |     apt-get install -y build-essential git cmake libcurl4-openssl-dev | ||||||
|  |  | ||||||
| WORKDIR /app | WORKDIR /app | ||||||
|  |  | ||||||
| COPY . . | COPY . . | ||||||
|  |  | ||||||
| ENV LLAMA_CURL=1 |  | ||||||
|  |  | ||||||
| RUN make -j$(nproc) llama-server | RUN \ | ||||||
|  |     # Build multiple versions of the CPU backend | ||||||
|  |     scripts/build-cpu.sh avx         -DGGML_AVX=ON -DGGML_AVX2=OFF && \ | ||||||
|  |     scripts/build-cpu.sh avx2        -DGGML_AVX=ON -DGGML_AVX2=ON && \ | ||||||
|  |     scripts/build-cpu.sh avx512      -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \ | ||||||
|  |     scripts/build-cpu.sh amx         -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ | ||||||
|  |     # Build llama-server | ||||||
|  |     cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ | ||||||
|  |     cmake --build build --target llama-server -j $(nproc) && \ | ||||||
|  |     # Copy the built libraries to /app/lib | ||||||
|  |     mkdir -p /app/lib && \ | ||||||
|  |     mv libggml-cpu* /app/lib/ && \ | ||||||
|  |     find build -name "*.so" -exec cp {} /app/lib/ \; | ||||||
|  |  | ||||||
| FROM ubuntu:$UBUNTU_VERSION AS runtime | FROM ubuntu:$UBUNTU_VERSION AS runtime | ||||||
|  |  | ||||||
| RUN apt-get update && \ | RUN apt-get update && \ | ||||||
|     apt-get install -y libcurl4-openssl-dev libgomp1 curl |     apt-get install -y libcurl4-openssl-dev libgomp1 curl | ||||||
|  |  | ||||||
| COPY --from=build /app/llama-server /llama-server | COPY --from=build /app/build/bin/llama-server /llama-server | ||||||
|  | COPY --from=build /app/lib/ / | ||||||
|  |  | ||||||
| ENV LC_ALL=C.utf8 | ENV LC_ALL=C.utf8 | ||||||
| # Must be set to 0.0.0.0 so it can listen to requests from host machine | # Must be set to 0.0.0.0 so it can listen to requests from host machine | ||||||
|   | |||||||
| @@ -96,10 +96,6 @@ if (NOT DEFINED GGML_LLAMAFILE) | |||||||
|     set(GGML_LLAMAFILE_DEFAULT ON) |     set(GGML_LLAMAFILE_DEFAULT ON) | ||||||
| endif() | endif() | ||||||
|  |  | ||||||
| if (NOT DEFINED GGML_AMX) |  | ||||||
|     set(GGML_AMX ON) |  | ||||||
| endif() |  | ||||||
|  |  | ||||||
| if (NOT DEFINED GGML_CUDA_GRAPHS) | if (NOT DEFINED GGML_CUDA_GRAPHS) | ||||||
|     set(GGML_CUDA_GRAPHS_DEFAULT ON) |     set(GGML_CUDA_GRAPHS_DEFAULT ON) | ||||||
| endif() | endif() | ||||||
|   | |||||||
| @@ -88,5 +88,5 @@ let package = Package( | |||||||
|             linkerSettings: linkerSettings |             linkerSettings: linkerSettings | ||||||
|         ) |         ) | ||||||
|     ], |     ], | ||||||
|     cxxLanguageStandard: .cxx11 |     cxxLanguageStandard: .cxx17 | ||||||
| ) | ) | ||||||
|   | |||||||
| @@ -96,6 +96,7 @@ option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF) | |||||||
| option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) | option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) | ||||||
|  |  | ||||||
| option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB}) | option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB}) | ||||||
|  | option(GGML_AVX_VNNI    "ggml: enable AVX-VNNI"         OFF) | ||||||
| option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB}) | option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB}) | ||||||
| option(GGML_AVX512      "ggml: enable AVX512"           OFF) | option(GGML_AVX512      "ggml: enable AVX512"           OFF) | ||||||
| option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF) | option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF) | ||||||
|   | |||||||
| @@ -211,27 +211,45 @@ extern "C" { | |||||||
|     GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); |     GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); | ||||||
|  |  | ||||||
|     // Add backend dynamic loading support to the backend |     // Add backend dynamic loading support to the backend | ||||||
|     typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); |  | ||||||
|  |  | ||||||
|     #ifdef GGML_BACKEND_DL |     // Initialize the backend | ||||||
|         #ifdef __cplusplus |     typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); | ||||||
|         #    define GGML_BACKEND_DL_IMPL(reg_fn)                                 \ |     // Optional: obtain a score for the backend based on the system configuration | ||||||
|  |     // Higher scores are preferred, 0 means the backend is not supported in the current system | ||||||
|  |     typedef int                (*ggml_backend_score_t)(void); | ||||||
|  |  | ||||||
|  | #ifdef GGML_BACKEND_DL | ||||||
|  | #    ifdef __cplusplus | ||||||
|  | #        define GGML_BACKEND_DL_IMPL(reg_fn)                             \ | ||||||
|             extern "C" {                                                 \ |             extern "C" {                                                 \ | ||||||
|             GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ |             GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ | ||||||
|             }                                                            \ |             }                                                            \ | ||||||
|             ggml_backend_reg_t ggml_backend_init(void) {                 \ |             ggml_backend_reg_t ggml_backend_init(void) {                 \ | ||||||
|                 return reg_fn();                                         \ |                 return reg_fn();                                         \ | ||||||
|             } |             } | ||||||
|         #else | #        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)       \ | ||||||
|         #    define GGML_BACKEND_DL_IMPL(reg_fn)                             \ |             extern "C" {                                   \ | ||||||
|  |             GGML_BACKEND_API int ggml_backend_score(void); \ | ||||||
|  |             }                                              \ | ||||||
|  |             int ggml_backend_score(void) {                 \ | ||||||
|  |                 return score_fn();                         \ | ||||||
|  |             } | ||||||
|  | #    else | ||||||
|  | #        define GGML_BACKEND_DL_IMPL(reg_fn)                              \ | ||||||
|             GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \ |             GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);  \ | ||||||
|             ggml_backend_reg_t                  ggml_backend_init(void) { \ |             ggml_backend_reg_t                  ggml_backend_init(void) { \ | ||||||
|                 return reg_fn();                                          \ |                 return reg_fn();                                          \ | ||||||
|             } |             } | ||||||
|         #endif | #        define GGML_BACKEND_DL_SCORE_IMPL(score_fn)        \ | ||||||
|     #else |             GGML_BACKEND_API int ggml_backend_score(void);  \ | ||||||
|     #    define GGML_BACKEND_DL_IMPL(reg_fn) |             int                  ggml_backend_score(void) { \ | ||||||
|     #endif |                 return score_fn();                          \ | ||||||
|  |             } | ||||||
|  | #    endif | ||||||
|  | #else | ||||||
|  | #    define GGML_BACKEND_DL_IMPL(reg_fn) | ||||||
|  | #    define GGML_BACKEND_DL_SCORE_IMPL(score_fn) | ||||||
|  | #endif | ||||||
|  |  | ||||||
| #ifdef  __cplusplus | #ifdef  __cplusplus | ||||||
| } | } | ||||||
|   | |||||||
| @@ -2,8 +2,13 @@ | |||||||
| #include "ggml-backend.h" | #include "ggml-backend.h" | ||||||
| #include "ggml-impl.h" | #include "ggml-impl.h" | ||||||
| #include <algorithm> | #include <algorithm> | ||||||
|  | #include <codecvt> | ||||||
| #include <cstring> | #include <cstring> | ||||||
|  | #include <filesystem> | ||||||
|  | #include <locale> | ||||||
|  | #include <memory> | ||||||
| #include <string> | #include <string> | ||||||
|  | #include <type_traits> | ||||||
| #include <vector> | #include <vector> | ||||||
|  |  | ||||||
| #ifdef _WIN32 | #ifdef _WIN32 | ||||||
| @@ -57,9 +62,71 @@ | |||||||
| #include "ggml-kompute.h" | #include "ggml-kompute.h" | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | #ifdef _WIN32 | ||||||
|  |  | ||||||
|  | using dl_handle = std::remove_pointer_t<HMODULE>; | ||||||
|  |  | ||||||
|  | struct dl_handle_deleter { | ||||||
|  |     void operator()(HMODULE handle) { | ||||||
|  |         FreeLibrary(handle); | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | static dl_handle * dl_load_library(const std::wstring & path) { | ||||||
|  |     // suppress error dialogs for missing DLLs | ||||||
|  |     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); | ||||||
|  |     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); | ||||||
|  |  | ||||||
|  |     HMODULE handle = LoadLibraryW(path.c_str()); | ||||||
|  |  | ||||||
|  |     SetErrorMode(old_mode); | ||||||
|  |  | ||||||
|  |     return handle; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static dl_handle * dl_load_library(const std::string & path) { | ||||||
|  |     std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; | ||||||
|  |     return dl_load_library(converter.from_bytes(path)); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void * dl_get_sym(dl_handle * handle, const char * name) { | ||||||
|  |     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); | ||||||
|  |     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); | ||||||
|  |  | ||||||
|  |     void * p = (void *) GetProcAddress(handle, name); | ||||||
|  |  | ||||||
|  |     SetErrorMode(old_mode); | ||||||
|  |  | ||||||
|  |     return p; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #else | ||||||
|  |  | ||||||
|  | using dl_handle = void; | ||||||
|  |  | ||||||
|  | struct dl_handle_deleter { | ||||||
|  |     void operator()(void * handle) { | ||||||
|  |         dlclose(handle); | ||||||
|  |     } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | static void * dl_load_library(const std::string & path) { | ||||||
|  |     dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); | ||||||
|  |  | ||||||
|  |     return handle; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void * dl_get_sym(dl_handle * handle, const char * name) { | ||||||
|  |     return dlsym(handle, name); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>; | ||||||
|  |  | ||||||
| struct ggml_backend_reg_entry { | struct ggml_backend_reg_entry { | ||||||
|     ggml_backend_reg_t reg; |     ggml_backend_reg_t reg; | ||||||
|     void * handle; |     dl_handle_ptr handle; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| struct ggml_backend_registry { | struct ggml_backend_registry { | ||||||
| @@ -97,13 +164,16 @@ struct ggml_backend_registry { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     ~ggml_backend_registry() { |     ~ggml_backend_registry() { | ||||||
|         while (!backends.empty()) { |         // FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources, | ||||||
|             // use silent since the log system may have been destroyed at this point |         // since backend threads may still be running and accessing resources from the dynamic library | ||||||
|             unload_backend(backends.back().reg, true); |         for (auto & entry : backends) { | ||||||
|  |             if (entry.handle) { | ||||||
|  |                 entry.handle.release(); // NOLINT | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { |     void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) { | ||||||
|         if (!reg) { |         if (!reg) { | ||||||
|             return; |             return; | ||||||
|         } |         } | ||||||
| @@ -112,7 +182,7 @@ struct ggml_backend_registry { | |||||||
|         GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", |         GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", | ||||||
|             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); |             __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); | ||||||
| #endif | #endif | ||||||
|         backends.push_back({ reg, handle }); |         backends.push_back({ reg, std::move(handle) }); | ||||||
|         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { |         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { | ||||||
|             register_device(ggml_backend_reg_dev_get(reg, i)); |             register_device(ggml_backend_reg_dev_get(reg, i)); | ||||||
|         } |         } | ||||||
| @@ -126,54 +196,31 @@ struct ggml_backend_registry { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     ggml_backend_reg_t load_backend(const char * path, bool silent) { |     ggml_backend_reg_t load_backend(const char * path, bool silent) { | ||||||
| #ifdef _WIN32 |         dl_handle_ptr handle { dl_load_library(path) }; | ||||||
|         // suppress error dialogs for missing DLLs |  | ||||||
|         DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); |  | ||||||
|         SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); |  | ||||||
|  |  | ||||||
|         HMODULE handle = LoadLibraryA(path); |  | ||||||
|  |  | ||||||
|         if (!handle) { |         if (!handle) { | ||||||
|             if (!silent) { |             if (!silent) { | ||||||
|                 GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); |                 GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path); | ||||||
|             } |             } | ||||||
|             SetErrorMode(old_mode); |  | ||||||
|             return nullptr; |             return nullptr; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); |         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); | ||||||
|  |         if (score_fn && score_fn() == 0) { | ||||||
|         SetErrorMode(old_mode); |  | ||||||
|  |  | ||||||
|         if (!backend_init) { |  | ||||||
|             if (!silent) { |             if (!silent) { | ||||||
|                 GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); |                 GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path); | ||||||
|             } |             } | ||||||
|             FreeLibrary(handle); |  | ||||||
|             return nullptr; |             return nullptr; | ||||||
|         } |         } | ||||||
| #else |  | ||||||
|         void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); |  | ||||||
|  |  | ||||||
|         if (!handle) { |         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); | ||||||
|  |         if (!backend_init_fn) { | ||||||
|             if (!silent) { |             if (!silent) { | ||||||
|                 GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); |                 GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path); | ||||||
|             } |             } | ||||||
|             return nullptr; |             return nullptr; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); |         ggml_backend_reg_t reg = backend_init_fn(); | ||||||
|  |  | ||||||
|         if (!backend_init) { |  | ||||||
|             if (!silent) { |  | ||||||
|                 GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); |  | ||||||
|             } |  | ||||||
|             dlclose(handle); |  | ||||||
|             return nullptr; |  | ||||||
|         } |  | ||||||
| #endif |  | ||||||
|         ggml_backend_reg_t reg = backend_init(); |  | ||||||
|  |  | ||||||
|         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { |         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { | ||||||
|             if (!silent) { |             if (!silent) { | ||||||
|                 if (!reg) { |                 if (!reg) { | ||||||
| @@ -183,22 +230,19 @@ struct ggml_backend_registry { | |||||||
|                         __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); |                         __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
| #ifdef _WIN32 |  | ||||||
|             FreeLibrary(handle); |  | ||||||
| #else |  | ||||||
|             dlclose(handle); |  | ||||||
| #endif |  | ||||||
|             return nullptr; |             return nullptr; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); |         GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); | ||||||
|         register_backend(reg, handle); |  | ||||||
|  |         register_backend(reg, std::move(handle)); | ||||||
|  |  | ||||||
|         return reg; |         return reg; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     void unload_backend(ggml_backend_reg_t reg, bool silent) { |     void unload_backend(ggml_backend_reg_t reg, bool silent) { | ||||||
|         auto it = std::find_if(backends.begin(), backends.end(), |         auto it = std::find_if(backends.begin(), backends.end(), | ||||||
|                                 [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); |                                [reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; }); | ||||||
|  |  | ||||||
|         if (it == backends.end()) { |         if (it == backends.end()) { | ||||||
|             if (!silent) { |             if (!silent) { | ||||||
| @@ -217,15 +261,6 @@ struct ggml_backend_registry { | |||||||
|                             [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), |                             [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), | ||||||
|             devices.end()); |             devices.end()); | ||||||
|  |  | ||||||
|         // unload library |  | ||||||
|         if (it->handle) { |  | ||||||
| #ifdef _WIN32 |  | ||||||
|             FreeLibrary((HMODULE) it->handle); |  | ||||||
| #else |  | ||||||
|             dlclose(it->handle); |  | ||||||
| #endif |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // remove backend |         // remove backend | ||||||
|         backends.erase(it); |         backends.erase(it); | ||||||
|     } |     } | ||||||
| @@ -341,12 +376,7 @@ void ggml_backend_unload(ggml_backend_reg_t reg) { | |||||||
|     get_reg().unload_backend(reg, true); |     get_reg().unload_backend(reg, true); | ||||||
| } | } | ||||||
|  |  | ||||||
| void ggml_backend_load_all() { | static std::string get_executable_path() { | ||||||
|     std::vector<std::string> search_prefix; |  | ||||||
|  |  | ||||||
|     // add the executable directory to the search path |  | ||||||
|     // FIXME: this is convenient for development, but it should probably be disabled in production |  | ||||||
|  |  | ||||||
| #if defined(__APPLE__) | #if defined(__APPLE__) | ||||||
|     // get executable path |     // get executable path | ||||||
|     std::vector<char> path; |     std::vector<char> path; | ||||||
| @@ -364,7 +394,7 @@ void ggml_backend_load_all() { | |||||||
|     if (last_slash != std::string::npos) { |     if (last_slash != std::string::npos) { | ||||||
|         base_path = base_path.substr(0, last_slash); |         base_path = base_path.substr(0, last_slash); | ||||||
|     } |     } | ||||||
|     search_prefix.push_back(base_path + "/"); |     return base_path + "/"; | ||||||
| #elif defined(__linux__) | #elif defined(__linux__) | ||||||
|     std::string base_path = "."; |     std::string base_path = "."; | ||||||
|     std::vector<char> path(1024); |     std::vector<char> path(1024); | ||||||
| @@ -386,38 +416,104 @@ void ggml_backend_load_all() { | |||||||
|         path.resize(path.size() * 2); |         path.resize(path.size() * 2); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     search_prefix.push_back(base_path + "/"); |     return base_path + "/"; | ||||||
|  | #elif defined(_WIN32) | ||||||
|  |     std::vector<char> path(MAX_PATH); | ||||||
|  |     DWORD len = GetModuleFileNameA(NULL, path.data(), path.size()); | ||||||
|  |     if (len == 0) { | ||||||
|  |         return ""; | ||||||
|  |     } | ||||||
|  |     std::string base_path(path.data(), len); | ||||||
|  |     // remove executable name | ||||||
|  |     auto last_slash = base_path.find_last_of('\\'); | ||||||
|  |     if (last_slash != std::string::npos) { | ||||||
|  |         base_path = base_path.substr(0, last_slash); | ||||||
|  |     } | ||||||
|  |     return base_path + "\\"; | ||||||
| #endif | #endif | ||||||
|  | } | ||||||
|     auto & reg = get_reg(); |  | ||||||
|  | static std::string backend_filename_prefix() { | ||||||
|     auto try_load = [&](const std::string & name) { | #ifdef _WIN32 | ||||||
|         std::string os_name; |     return "ggml-"; | ||||||
| #ifdef _WIN32 | #else | ||||||
|         os_name = "ggml-" + name + ".dll"; |     return "libggml-"; | ||||||
| #else | #endif | ||||||
|         os_name = "libggml-" + name + ".so"; | } | ||||||
| #endif |  | ||||||
|         if (reg.load_backend(os_name.c_str(), true)) { | static std::string backend_filename_suffix() { | ||||||
|             return; | #ifdef _WIN32 | ||||||
|         } |     return ".dll"; | ||||||
|         for (const auto & prefix : search_prefix) { | #else | ||||||
|             if (reg.load_backend((prefix + os_name).c_str(), true)) { |     return ".so"; | ||||||
|                 return; | #endif | ||||||
|             } | } | ||||||
|         } |  | ||||||
|     }; | static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { | ||||||
|  |     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths | ||||||
|     try_load("amx"); |      // TODO: search system paths | ||||||
|     try_load("blas"); |     std::vector<std::string> search_paths = { "./", get_executable_path() }; | ||||||
|     try_load("cann"); |     std::string file_prefix = backend_filename_prefix() + name + "-"; | ||||||
|     try_load("cuda"); |  | ||||||
|     try_load("hip"); |     int best_score = 0; | ||||||
|     try_load("kompute"); |     std::string best_path; | ||||||
|     try_load("metal"); |  | ||||||
|     try_load("rpc"); |     namespace fs = std::filesystem; | ||||||
|     try_load("sycl"); |     for (const auto & search_path : search_paths) { | ||||||
|     try_load("vulkan"); |         if (!fs::exists(search_path)) { | ||||||
|     try_load("musa"); |             continue; | ||||||
|     try_load("cpu"); |         } | ||||||
|  |         for (const auto & entry : fs::directory_iterator(search_path)) { | ||||||
|  |             if (entry.is_regular_file()) { | ||||||
|  |                 std::string filename = entry.path().filename().string(); | ||||||
|  |                 std::string ext = entry.path().extension().string(); | ||||||
|  |                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { | ||||||
|  |                     dl_handle_ptr handle { dl_load_library(entry.path().c_str()) }; | ||||||
|  |                     if (!handle && !silent) { | ||||||
|  |                         GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str()); | ||||||
|  |                     } | ||||||
|  |                     if (handle) { | ||||||
|  |                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); | ||||||
|  |                         if (score_fn) { | ||||||
|  |                             int s = score_fn(); | ||||||
|  | #ifndef NDEBUG | ||||||
|  |                             GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s); | ||||||
|  | #endif | ||||||
|  |                             if (s > best_score) { | ||||||
|  |                                 best_score = s; | ||||||
|  |                                 best_path = entry.path().string(); | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     if (best_score == 0) { | ||||||
|  |         // try to load the base backend | ||||||
|  |         for (const auto & search_path : search_paths) { | ||||||
|  |             std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix(); | ||||||
|  |             if (fs::exists(path)) { | ||||||
|  |                 return get_reg().load_backend(path.c_str(), silent); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         return nullptr; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     return get_reg().load_backend(best_path.c_str(), silent); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void ggml_backend_load_all() { | ||||||
|  |     ggml_backend_load_best("blas", true); | ||||||
|  |     ggml_backend_load_best("cann", true); | ||||||
|  |     ggml_backend_load_best("cuda", true); | ||||||
|  |     ggml_backend_load_best("hip", true); | ||||||
|  |     ggml_backend_load_best("kompute", true); | ||||||
|  |     ggml_backend_load_best("metal", true); | ||||||
|  |     ggml_backend_load_best("rpc", true); | ||||||
|  |     ggml_backend_load_best("sycl", true); | ||||||
|  |     ggml_backend_load_best("vulkan", true); | ||||||
|  |     ggml_backend_load_best("musa", true); | ||||||
|  |     ggml_backend_load_best("cpu", true); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -217,6 +217,12 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW | |||||||
|         elseif (GGML_AVX) |         elseif (GGML_AVX) | ||||||
|             list(APPEND ARCH_FLAGS /arch:AVX) |             list(APPEND ARCH_FLAGS /arch:AVX) | ||||||
|         endif() |         endif() | ||||||
|  |         if (GGML_AVX_VNNI) | ||||||
|  |             list(APPEND ARCH_DEFINITIONS __AVXVNNI__) | ||||||
|  |             if (CMAKE_C_COMPILER_ID STREQUAL "Clang") | ||||||
|  |                 list(APPEND ARCH_FLAGS -mavxvnni) | ||||||
|  |             endif() | ||||||
|  |         endif() | ||||||
|     else() |     else() | ||||||
|         if (GGML_NATIVE) |         if (GGML_NATIVE) | ||||||
|             list(APPEND ARCH_FLAGS -march=native) |             list(APPEND ARCH_FLAGS -march=native) | ||||||
| @@ -233,6 +239,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW | |||||||
|         if (GGML_AVX2) |         if (GGML_AVX2) | ||||||
|             list(APPEND ARCH_FLAGS -mavx2) |             list(APPEND ARCH_FLAGS -mavx2) | ||||||
|         endif() |         endif() | ||||||
|  |         if (GGML_AVX_VNNI) | ||||||
|  |             list(APPEND ARCH_FLAGS -mavxvnni) | ||||||
|  |         endif() | ||||||
|         if (GGML_AVX512) |         if (GGML_AVX512) | ||||||
|             list(APPEND ARCH_FLAGS -mavx512f) |             list(APPEND ARCH_FLAGS -mavx512f) | ||||||
|             list(APPEND ARCH_FLAGS -mavx512dq) |             list(APPEND ARCH_FLAGS -mavx512dq) | ||||||
| @@ -301,6 +310,10 @@ target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) | |||||||
| set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}") | set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS     "${ARCH_FLAGS}") | ||||||
| set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") | set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") | ||||||
|  |  | ||||||
|  | # the feature detection code must be compiled without any architecture flags | ||||||
|  | target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) | ||||||
|  | # target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection | ||||||
|  |  | ||||||
| if (EMSCRIPTEN) | if (EMSCRIPTEN) | ||||||
|     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") |     set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") | ||||||
| endif() | endif() | ||||||
|   | |||||||
| @@ -78,7 +78,6 @@ inline void parallel_for_ggml(const ggml_compute_params * params, int n, const f | |||||||
|     int tbegin, tend; |     int tbegin, tend; | ||||||
|     balance211(n, params->nth, params->ith, tbegin, tend); |     balance211(n, params->nth, params->ith, tbegin, tend); | ||||||
|     f(tbegin, tend); |     f(tbegin, tend); | ||||||
|     ggml_barrier(params->threadpool); // TODO: might not always be needed |  | ||||||
| } | } | ||||||
|  |  | ||||||
| // quantized types that have AMX support | // quantized types that have AMX support | ||||||
|   | |||||||
| @@ -1340,21 +1340,19 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K> | |||||||
|         __m512 vb[COLS]; |         __m512 vb[COLS]; | ||||||
|         __m512 vc[ROWS * COLS]; |         __m512 vc[ROWS * COLS]; | ||||||
|  |  | ||||||
|         auto loadc = [&](int idx) { |         auto loadc = [&](auto idx) { | ||||||
|             vc[idx] = _mm512_setzero_ps(); |             vc[idx] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<ROWS * COLS>{}(loadc); |         Unroll<ROWS * COLS>{}(loadc); | ||||||
|  |  | ||||||
|         auto compute = [&](int idx, int k) { |         auto compute = [&](auto idx, auto k) { | ||||||
|             // TODO: use `constexpr` here to get rid of interger div |             constexpr int row = idx / COLS; | ||||||
|             // when upgraded to C++17 |             constexpr int col = idx % COLS; | ||||||
|             const int row = idx / COLS; |  | ||||||
|             const int col = idx % COLS; |  | ||||||
|  |  | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 va = _mm512_loadu_ps(A + row * K + k); |                 va = _mm512_loadu_ps(A + row * K + k); | ||||||
|             } |             } | ||||||
|             if (row == 0) { |             if constexpr (row == 0) { | ||||||
|                 vb[col] =  _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k))); |                 vb[col] =  _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(B + col * K + k))); | ||||||
|             } |             } | ||||||
|             vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); |             vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); | ||||||
| @@ -1364,9 +1362,9 @@ struct tinygemm_kernel_avx<float, ggml_fp16_t, float, BLOCK_M, BLOCK_N, BLOCK_K> | |||||||
|             Unroll<ROWS * COLS>{}(compute, k); |             Unroll<ROWS * COLS>{}(compute, k); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         auto storec = [&](int idx) { |         auto storec = [&](auto idx) { | ||||||
|             const int row = idx / COLS; |             constexpr int row = idx / COLS; | ||||||
|             const int col = idx % COLS; |             constexpr int col = idx % COLS; | ||||||
|             C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]); |             C[row * ldc + col] = _mm512_reduce_add_ps(vc[idx]); | ||||||
|         }; |         }; | ||||||
|         Unroll<ROWS * COLS>{}(storec); |         Unroll<ROWS * COLS>{}(storec); | ||||||
| @@ -1429,14 +1427,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         const __m512i off = _mm512_set1_epi8(8); |         const __m512i off = _mm512_set1_epi8(8); | ||||||
|         const __m512i lowMask = _mm512_set1_epi8(0xF); |         const __m512i lowMask = _mm512_set1_epi8(0xF); | ||||||
|  |  | ||||||
|         auto loadc = [&](int col) { |         auto loadc = [&](auto col) { | ||||||
|             vc[col] = _mm512_setzero_ps(); |             vc[col] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(loadc); |         Unroll<COLS>{}(loadc); | ||||||
|  |  | ||||||
|         auto compute = [&](int col, int i) { |         auto compute = [&](auto col, auto i) { | ||||||
|             // load a and compute compensation |             // load a and compute compensation | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); |                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); | ||||||
|                 vcomp = _mm512_setzero_si512(); |                 vcomp = _mm512_setzero_si512(); | ||||||
|                 for (int k = 0; k < 8; ++k) { |                 for (int k = 0; k < 8; ++k) { | ||||||
| @@ -1468,7 +1466,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         //store to C |         //store to C | ||||||
|         auto storec = [&](int col) { |         auto storec = [&](auto col) { | ||||||
|             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); |             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(storec); |         Unroll<COLS>{}(storec); | ||||||
| @@ -1492,14 +1490,14 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K> | |||||||
|  |  | ||||||
|         const __m512i lowMask = _mm512_set1_epi8(0xF); |         const __m512i lowMask = _mm512_set1_epi8(0xF); | ||||||
|  |  | ||||||
|         auto loadc = [&](int col) { |         auto loadc = [&](auto col) { | ||||||
|             vc[col] = _mm512_setzero_ps(); |             vc[col] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(loadc); |         Unroll<COLS>{}(loadc); | ||||||
|  |  | ||||||
|         auto compute = [&](int col, int i) { |         auto compute = [&](auto col, auto i) { | ||||||
|             // load a |             // load a | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); |                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); | ||||||
|                 for (int k = 0; k < 8; ++k) { |                 for (int k = 0; k < 8; ++k) { | ||||||
|                     va[k] = _mm512_set1_epi32(a_ptr[k]); |                     va[k] = _mm512_set1_epi32(a_ptr[k]); | ||||||
| @@ -1533,7 +1531,7 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K> | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         //store to C |         //store to C | ||||||
|         auto storec = [&](int col) { |         auto storec = [&](auto col) { | ||||||
|             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); |             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(storec); |         Unroll<COLS>{}(storec); | ||||||
| @@ -1564,14 +1562,14 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         // |         // | ||||||
|         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80)); |         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80)); | ||||||
|  |  | ||||||
|         auto loadc = [&](int col) { |         auto loadc = [&](auto col) { | ||||||
|             vc[col] = _mm512_setzero_ps(); |             vc[col] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(loadc); |         Unroll<COLS>{}(loadc); | ||||||
|  |  | ||||||
|         auto compute = [&](int col, int i) { |         auto compute = [&](auto col, auto i) { | ||||||
|             // load a and add offset 128 |             // load a and add offset 128 | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); |                 const int32_t * a_ptr = reinterpret_cast<const int32_t *>(A[0 * KB + i].qs); | ||||||
|                 for (int k = 0; k < 8; ++k) { |                 for (int k = 0; k < 8; ++k) { | ||||||
|                     va[k] = _mm512_set1_epi32(a_ptr[k]); |                     va[k] = _mm512_set1_epi32(a_ptr[k]); | ||||||
| @@ -1604,7 +1602,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         //store to C |         //store to C | ||||||
|         auto storec = [&](int col) { |         auto storec = [&](auto col) { | ||||||
|             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); |             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(storec); |         Unroll<COLS>{}(storec); | ||||||
| @@ -1636,7 +1634,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO | |||||||
|  |  | ||||||
|         const __m512i lowMask = _mm512_set1_epi8(0xF); |         const __m512i lowMask = _mm512_set1_epi8(0xF); | ||||||
|  |  | ||||||
|         auto loadc = [&](int col) { |         auto loadc = [&](auto col) { | ||||||
|             vc[col] = _mm512_setzero_ps(); |             vc[col] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(loadc); |         Unroll<COLS>{}(loadc); | ||||||
| @@ -1650,9 +1648,9 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         //     int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8 |         //     int16 {k/2, n, 2}, viewed as 2d {k/2, 2n}, k = 8 | ||||||
|         //     from {16,  8} to {4, 32} |         //     from {16,  8} to {4, 32} | ||||||
|         // |         // | ||||||
|         auto compute = [&](int col, int i) { |         auto compute = [&](auto col, auto i) { | ||||||
|             // load a |             // load a | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 for (int k_group = 0; k_group < QK_K / 32; ++k_group) { |                 for (int k_group = 0; k_group < QK_K / 32; ++k_group) { | ||||||
|                     va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); |                     va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); | ||||||
|                 } |                 } | ||||||
| @@ -1704,7 +1702,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q4_K, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         //store to C |         //store to C | ||||||
|         auto storec = [&](int col) { |         auto storec = [&](auto col) { | ||||||
|             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); |             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(storec); |         Unroll<COLS>{}(storec); | ||||||
| @@ -1737,15 +1735,15 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO | |||||||
|  |  | ||||||
|         const __m512i lowMask = _mm512_set1_epi8(0xF); |         const __m512i lowMask = _mm512_set1_epi8(0xF); | ||||||
|  |  | ||||||
|         auto loadc = [&](int col) { |         auto loadc = [&](auto col) { | ||||||
|             vc[col] = _mm512_setzero_ps(); |             vc[col] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(loadc); |         Unroll<COLS>{}(loadc); | ||||||
|  |  | ||||||
|         // Q5_K and Q4_K shares the same vnni formats, refer to notes above. |         // Q5_K and Q4_K shares the same vnni formats, refer to notes above. | ||||||
|         auto compute = [&](int col, int i) { |         auto compute = [&](auto col, auto i) { | ||||||
|             // load a |             // load a | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 for (int k_group = 0; k_group < QK_K / 32; ++k_group) { |                 for (int k_group = 0; k_group < QK_K / 32; ++k_group) { | ||||||
|                     va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); |                     va[k_group] = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i *)(A[0 * KB + i].qs + k_group * 32))); | ||||||
|                 } |                 } | ||||||
| @@ -1810,7 +1808,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q5_K, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         //store to C |         //store to C | ||||||
|         auto storec = [&](int col) { |         auto storec = [&](auto col) { | ||||||
|             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); |             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(storec); |         Unroll<COLS>{}(storec); | ||||||
| @@ -1843,13 +1841,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_q6_K, float, BLOCK_M, BLOCK_N, BLO | |||||||
|         const __m512i m32s = _mm512_set1_epi32(32); |         const __m512i m32s = _mm512_set1_epi32(32); | ||||||
|         const __m512i lowMask = _mm512_set1_epi8(0xF); |         const __m512i lowMask = _mm512_set1_epi8(0xF); | ||||||
|  |  | ||||||
|         auto loadc = [&](int col) { |         auto loadc = [&](auto col) { | ||||||
|             vc[col] = _mm512_setzero_ps(); |             vc[col] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(loadc); |         Unroll<COLS>{}(loadc); | ||||||
|  |  | ||||||
|         auto compute = [&](int col, int i) { |         auto compute = [&](auto col, auto i) { | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 // load a |                 // load a | ||||||
|                 va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0)); |                 va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0)); | ||||||
|                 va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64)); |                 va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64)); | ||||||
| @@ -1961,13 +1959,13 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B | |||||||
|         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80)); |         const __m512i off = _mm512_set1_epi8(static_cast<char>(0x80)); | ||||||
|         const __m512i values256 = _mm512_add_epi8(values128, off); |         const __m512i values256 = _mm512_add_epi8(values128, off); | ||||||
|  |  | ||||||
|         auto loadc = [&](int col) { |         auto loadc = [&](auto col) { | ||||||
|             vc[col] = _mm512_setzero_ps(); |             vc[col] = _mm512_setzero_ps(); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(loadc); |         Unroll<COLS>{}(loadc); | ||||||
|  |  | ||||||
|         auto compute = [&](int col, int i) { |         auto compute = [&](auto col, auto i) { | ||||||
|             if (col == 0) { |             if constexpr (col == 0) { | ||||||
|                 // load a |                 // load a | ||||||
|                 va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0)); |                 va[0] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +   0)); | ||||||
|                 va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64)); |                 va[1] = _mm512_loadu_si512((const __m512i *)(A[0 * KB + i].qs +  64)); | ||||||
| @@ -2017,7 +2015,7 @@ struct tinygemm_kernel_vnni<block_q8_K, block_iq4_xs, float, BLOCK_M, BLOCK_N, B | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         //store to C |         //store to C | ||||||
|         auto storec = [&](int col) { |         auto storec = [&](auto col) { | ||||||
|             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); |             _mm512_storeu_ps((__m512i*)(C + 0 * ldc + col * 16), vc[col]); | ||||||
|         }; |         }; | ||||||
|         Unroll<COLS>{}(storec); |         Unroll<COLS>{}(storec); | ||||||
|   | |||||||
							
								
								
									
										298
									
								
								ggml/src/ggml-cpu/cpu-feats-x86.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										298
									
								
								ggml/src/ggml-cpu/cpu-feats-x86.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,298 @@ | |||||||
|  | #include "ggml-cpu.h" | ||||||
|  | #include "ggml-backend-impl.h" | ||||||
|  |  | ||||||
|  | #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) | ||||||
|  |  | ||||||
|  | #ifdef _MSC_VER | ||||||
|  | #include <intrin.h> | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #include <cstring> | ||||||
|  | #include <vector> | ||||||
|  | #include <bitset> | ||||||
|  | #include <array> | ||||||
|  | #include <string> | ||||||
|  |  | ||||||
|  | struct cpuid_x86 { | ||||||
|  |     bool SSE3(void) { return f_1_ecx[0]; } | ||||||
|  |     bool PCLMULQDQ(void) { return f_1_ecx[1]; } | ||||||
|  |     bool MONITOR(void) { return f_1_ecx[3]; } | ||||||
|  |     bool SSSE3(void) { return f_1_ecx[9]; } | ||||||
|  |     bool FMA(void) { return f_1_ecx[12]; } | ||||||
|  |     bool CMPXCHG16B(void) { return f_1_ecx[13]; } | ||||||
|  |     bool SSE41(void) { return f_1_ecx[19]; } | ||||||
|  |     bool SSE42(void) { return f_1_ecx[20]; } | ||||||
|  |     bool MOVBE(void) { return f_1_ecx[22]; } | ||||||
|  |     bool POPCNT(void) { return f_1_ecx[23]; } | ||||||
|  |     bool AES(void) { return f_1_ecx[25]; } | ||||||
|  |     bool XSAVE(void) { return f_1_ecx[26]; } | ||||||
|  |     bool OSXSAVE(void) { return f_1_ecx[27]; } | ||||||
|  |     bool AVX(void) { return f_1_ecx[28]; } | ||||||
|  |     bool F16C(void) { return f_1_ecx[29]; } | ||||||
|  |     bool RDRAND(void) { return f_1_ecx[30]; } | ||||||
|  |  | ||||||
|  |     bool MSR(void) { return f_1_edx[5]; } | ||||||
|  |     bool CX8(void) { return f_1_edx[8]; } | ||||||
|  |     bool SEP(void) { return f_1_edx[11]; } | ||||||
|  |     bool CMOV(void) { return f_1_edx[15]; } | ||||||
|  |     bool CLFSH(void) { return f_1_edx[19]; } | ||||||
|  |     bool MMX(void) { return f_1_edx[23]; } | ||||||
|  |     bool FXSR(void) { return f_1_edx[24]; } | ||||||
|  |     bool SSE(void) { return f_1_edx[25]; } | ||||||
|  |     bool SSE2(void) { return f_1_edx[26]; } | ||||||
|  |  | ||||||
|  |     bool FSGSBASE(void) { return f_7_ebx[0]; } | ||||||
|  |     bool BMI1(void) { return f_7_ebx[3]; } | ||||||
|  |     bool HLE(void) { return is_intel && f_7_ebx[4]; } | ||||||
|  |     bool AVX2(void) { return f_7_ebx[5]; } | ||||||
|  |     bool BMI2(void) { return f_7_ebx[8]; } | ||||||
|  |     bool ERMS(void) { return f_7_ebx[9]; } | ||||||
|  |     bool INVPCID(void) { return f_7_ebx[10]; } | ||||||
|  |     bool RTM(void) { return is_intel && f_7_ebx[11]; } | ||||||
|  |     bool AVX512F(void) { return f_7_ebx[16]; } | ||||||
|  |     bool RDSEED(void) { return f_7_ebx[18]; } | ||||||
|  |     bool ADX(void) { return f_7_ebx[19]; } | ||||||
|  |     bool AVX512PF(void) { return f_7_ebx[26]; } | ||||||
|  |     bool AVX512ER(void) { return f_7_ebx[27]; } | ||||||
|  |     bool AVX512CD(void) { return f_7_ebx[28]; } | ||||||
|  |     bool SHA(void) { return f_7_ebx[29]; } | ||||||
|  |  | ||||||
|  |     bool PREFETCHWT1(void) { return f_7_ecx[0]; } | ||||||
|  |  | ||||||
|  |     bool LAHF(void) { return f_81_ecx[0]; } | ||||||
|  |     bool LZCNT(void) { return is_intel && f_81_ecx[5]; } | ||||||
|  |     bool ABM(void) { return is_amd && f_81_ecx[5]; } | ||||||
|  |     bool SSE4a(void) { return is_amd && f_81_ecx[6]; } | ||||||
|  |     bool XOP(void) { return is_amd && f_81_ecx[11]; } | ||||||
|  |     bool TBM(void) { return is_amd && f_81_ecx[21]; } | ||||||
|  |  | ||||||
|  |     bool SYSCALL(void) { return is_intel && f_81_edx[11]; } | ||||||
|  |     bool MMXEXT(void) { return is_amd && f_81_edx[22]; } | ||||||
|  |     bool RDTSCP(void) { return is_intel && f_81_edx[27]; } | ||||||
|  |     bool _3DNOWEXT(void) { return is_amd && f_81_edx[30]; } | ||||||
|  |     bool _3DNOW(void) { return is_amd && f_81_edx[31]; } | ||||||
|  |  | ||||||
|  |     bool AVX512_VBMI(void) { return f_7_ecx[1]; } | ||||||
|  |     bool AVX512_VNNI(void) { return f_7_ecx[11]; } | ||||||
|  |     bool AVX512_FP16(void) { return f_7_edx[23]; } | ||||||
|  |     bool AVX512_BF16(void) { return f_7_1_eax[5]; } | ||||||
|  |     bool AVX_VNNI(void) { return f_7_1_eax[4]; } | ||||||
|  |  | ||||||
|  |     bool AMX_TILE(void) { return f_7_edx[24]; } | ||||||
|  |     bool AMX_INT8(void) { return f_7_edx[25]; } | ||||||
|  |     bool AMX_FP16(void) { return f_7_1_eax[21]; } | ||||||
|  |     bool AMX_BF16(void) { return f_7_edx[22]; } | ||||||
|  |  | ||||||
|  | #ifdef _MSC_VER | ||||||
|  |     static void cpuid(int cpu_info[4], int eax) { | ||||||
|  |         __cpuid(cpu_info, eax); | ||||||
|  |     } | ||||||
|  |     static void cpuidex(int cpu_info[4], int eax, int ecx) { | ||||||
|  |         __cpuidex(cpu_info, eax, ecx); | ||||||
|  |     } | ||||||
|  | #else | ||||||
|  |     static void cpuid(int cpu_info[4], int eax) { | ||||||
|  |         __asm__ __volatile__( | ||||||
|  |             "cpuid" | ||||||
|  |             : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) | ||||||
|  |             : "a"(eax), "c"(0)); | ||||||
|  |     } | ||||||
|  |     static void cpuidex(int cpu_info[4], int eax, int ecx) { | ||||||
|  |         __asm__ __volatile__( | ||||||
|  |             "cpuid" | ||||||
|  |             : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) | ||||||
|  |             : "a"(eax), "c"(ecx)); | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |     cpuid_x86() { | ||||||
|  |         std::array<int, 4> cpui; | ||||||
|  |         std::vector<std::array<int, 4>> data; | ||||||
|  |  | ||||||
|  |         // calling __cpuid with 0x0 as the function_id argument | ||||||
|  |         // gets the number of the highest valid function ID. | ||||||
|  |         cpuid(cpui.data(), 0); | ||||||
|  |         int n_ids = cpui[0]; | ||||||
|  |  | ||||||
|  |         for (int i = 0; i <= n_ids; ++i) { | ||||||
|  |             cpuidex(cpui.data(), i, 0); | ||||||
|  |             data.push_back(cpui); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // capture vendor string | ||||||
|  |         char vendor[0x20] = {}; | ||||||
|  |         *reinterpret_cast<int *>(vendor)     = data[0][1]; | ||||||
|  |         *reinterpret_cast<int *>(vendor + 4) = data[0][3]; | ||||||
|  |         *reinterpret_cast<int *>(vendor + 8) = data[0][2]; | ||||||
|  |         this->vendor = vendor; | ||||||
|  |         if (this->vendor == "GenuineIntel") { | ||||||
|  |             is_intel = true; | ||||||
|  |         } else if (this->vendor == "AuthenticAMD") { | ||||||
|  |             is_amd = true; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // load bitset with flags for function 0x00000001 | ||||||
|  |         if (n_ids >= 1) { | ||||||
|  |             f_1_ecx = data[1][2]; | ||||||
|  |             f_1_edx = data[1][3]; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // load bitset with flags for function 0x00000007 | ||||||
|  |         if (n_ids >= 7) { | ||||||
|  |             f_7_ebx = data[7][1]; | ||||||
|  |             f_7_ecx = data[7][2]; | ||||||
|  |             f_7_edx = data[7][3]; | ||||||
|  |             cpuidex(cpui.data(), 7, 1); | ||||||
|  |             f_7_1_eax = cpui[0]; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // calling __cpuid with 0x80000000 as the function_id argument | ||||||
|  |         // gets the number of the highest valid extended ID. | ||||||
|  |         cpuid(cpui.data(), 0x80000000); | ||||||
|  |         unsigned int n_ex_ids = cpui[0]; | ||||||
|  |  | ||||||
|  |         std::vector<std::array<int, 4>> ext_data; | ||||||
|  |         for (unsigned int i = 0x80000000; i <= n_ex_ids; ++i) { | ||||||
|  |             cpuidex(cpui.data(), i, 0); | ||||||
|  |             ext_data.push_back(cpui); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // load bitset with flags for function 0x80000001 | ||||||
|  |         if (n_ex_ids >= 0x80000001) { | ||||||
|  |             f_81_ecx = ext_data[1][2]; | ||||||
|  |             f_81_edx = ext_data[1][3]; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // interpret CPU brand string if reported | ||||||
|  |         char brand[0x40] = {}; | ||||||
|  |         if (n_ex_ids >= 0x80000004) { | ||||||
|  |             std::memcpy(brand, ext_data[2].data(), sizeof(cpui)); | ||||||
|  |             std::memcpy(brand + 16, ext_data[3].data(), sizeof(cpui)); | ||||||
|  |             std::memcpy(brand + 32, ext_data[4].data(), sizeof(cpui)); | ||||||
|  |             this->brand = brand; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     bool is_intel = false; | ||||||
|  |     bool is_amd = false; | ||||||
|  |     std::string vendor; | ||||||
|  |     std::string brand; | ||||||
|  |     std::bitset<32> f_1_ecx; | ||||||
|  |     std::bitset<32> f_1_edx; | ||||||
|  |     std::bitset<32> f_7_ebx; | ||||||
|  |     std::bitset<32> f_7_ecx; | ||||||
|  |     std::bitset<32> f_7_edx; | ||||||
|  |     std::bitset<32> f_7_1_eax; | ||||||
|  |     std::bitset<32> f_81_ecx; | ||||||
|  |     std::bitset<32> f_81_edx; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | #if 0 | ||||||
|  | void test_x86_is() { | ||||||
|  |     cpuid_x86 is; | ||||||
|  |     printf("CPU Vendor: %s\n", is.vendor.c_str()); | ||||||
|  |     printf("Brand: %s\n", is.brand.c_str()); | ||||||
|  |     printf("is_intel: %d\n", is.is_intel); | ||||||
|  |     printf("is_amd: %d\n", is.is_amd); | ||||||
|  |     printf("sse3: %d\n", is.SSE3()); | ||||||
|  |     printf("pclmulqdq: %d\n", is.PCLMULQDQ()); | ||||||
|  |     printf("ssse3: %d\n", is.SSSE3()); | ||||||
|  |     printf("fma: %d\n", is.FMA()); | ||||||
|  |     printf("cmpxchg16b: %d\n", is.CMPXCHG16B()); | ||||||
|  |     printf("sse41: %d\n", is.SSE41()); | ||||||
|  |     printf("sse42: %d\n", is.SSE42()); | ||||||
|  |     printf("movbe: %d\n", is.MOVBE()); | ||||||
|  |     printf("popcnt: %d\n", is.POPCNT()); | ||||||
|  |     printf("aes: %d\n", is.AES()); | ||||||
|  |     printf("xsave: %d\n", is.XSAVE()); | ||||||
|  |     printf("osxsave: %d\n", is.OSXSAVE()); | ||||||
|  |     printf("avx: %d\n", is.AVX()); | ||||||
|  |     printf("f16c: %d\n", is.F16C()); | ||||||
|  |     printf("rdrand: %d\n", is.RDRAND()); | ||||||
|  |     printf("msr: %d\n", is.MSR()); | ||||||
|  |     printf("cx8: %d\n", is.CX8()); | ||||||
|  |     printf("sep: %d\n", is.SEP()); | ||||||
|  |     printf("cmov: %d\n", is.CMOV()); | ||||||
|  |     printf("clflush: %d\n", is.CLFSH()); | ||||||
|  |     printf("mmx: %d\n", is.MMX()); | ||||||
|  |     printf("fxsr: %d\n", is.FXSR()); | ||||||
|  |     printf("sse: %d\n", is.SSE()); | ||||||
|  |     printf("sse2: %d\n", is.SSE2()); | ||||||
|  |     printf("fsgsbase: %d\n", is.FSGSBASE()); | ||||||
|  |     printf("bmi1: %d\n", is.BMI1()); | ||||||
|  |     printf("hle: %d\n", is.HLE()); | ||||||
|  |     printf("avx2: %d\n", is.AVX2()); | ||||||
|  |     printf("bmi2: %d\n", is.BMI2()); | ||||||
|  |     printf("erms: %d\n", is.ERMS()); | ||||||
|  |     printf("invpcid: %d\n", is.INVPCID()); | ||||||
|  |     printf("rtm: %d\n", is.RTM()); | ||||||
|  |     printf("avx512f: %d\n", is.AVX512F()); | ||||||
|  |     printf("rdseed: %d\n", is.RDSEED()); | ||||||
|  |     printf("adx: %d\n", is.ADX()); | ||||||
|  |     printf("avx512pf: %d\n", is.AVX512PF()); | ||||||
|  |     printf("avx512er: %d\n", is.AVX512ER()); | ||||||
|  |     printf("avx512cd: %d\n", is.AVX512CD()); | ||||||
|  |     printf("sha: %d\n", is.SHA()); | ||||||
|  |     printf("prefetchwt1: %d\n", is.PREFETCHWT1()); | ||||||
|  |     printf("lahf: %d\n", is.LAHF()); | ||||||
|  |     printf("lzcnt: %d\n", is.LZCNT()); | ||||||
|  |     printf("abm: %d\n", is.ABM()); | ||||||
|  |     printf("sse4a: %d\n", is.SSE4a()); | ||||||
|  |     printf("xop: %d\n", is.XOP()); | ||||||
|  |     printf("tbm: %d\n", is.TBM()); | ||||||
|  |     printf("syscall: %d\n", is.SYSCALL()); | ||||||
|  |     printf("mmxext: %d\n", is.MMXEXT()); | ||||||
|  |     printf("rdtscp: %d\n", is.RDTSCP()); | ||||||
|  |     printf("3dnowext: %d\n", is._3DNOWEXT()); | ||||||
|  |     printf("3dnow: %d\n", is._3DNOW()); | ||||||
|  |     printf("avx512_vbmi: %d\n", is.AVX512_VBMI()); | ||||||
|  |     printf("avx512_vnni: %d\n", is.AVX512_VNNI()); | ||||||
|  |     printf("avx512_fp16: %d\n", is.AVX512_FP16()); | ||||||
|  |     printf("avx512_bf16: %d\n", is.AVX512_BF16()); | ||||||
|  |     printf("amx_tile: %d\n", is.AMX_TILE()); | ||||||
|  |     printf("amx_int8: %d\n", is.AMX_INT8()); | ||||||
|  |     printf("amx_fp16: %d\n", is.AMX_FP16()); | ||||||
|  |     printf("amx_bf16: %d\n", is.AMX_BF16()); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | static int ggml_backend_cpu_x86_score() { | ||||||
|  |     // FIXME: this does not check for OS support | ||||||
|  |  | ||||||
|  |     cpuid_x86 is; | ||||||
|  |     // if the CPU backend was built with any features not supported by the current CPU, it cannot be used | ||||||
|  |     if (ggml_cpu_has_fma() && !is.FMA()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_avx() && !is.AVX()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; } | ||||||
|  |     if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; } | ||||||
|  |  | ||||||
|  |     // calculate a backend score based on the supported features | ||||||
|  |     // more important features have a higher weight | ||||||
|  |     int score = 0; | ||||||
|  |     score +=  ggml_cpu_has_fma        () * 1; | ||||||
|  |     score +=  ggml_cpu_has_f16c       () * 1<<1; | ||||||
|  |     score +=  ggml_cpu_has_ssse3      () * 1<<2; | ||||||
|  |     score +=  ggml_cpu_has_sse3       () * 1<<3; | ||||||
|  |     score +=  ggml_cpu_has_avx_vnni   () * 1<<4; | ||||||
|  |     score +=  ggml_cpu_has_avx        () * 1<<5; | ||||||
|  |     score +=  ggml_cpu_has_avx2       () * 1<<6; | ||||||
|  |     score +=  ggml_cpu_has_avx512     () * 1<<7; | ||||||
|  |     // score +=  ggml_cpu_has_avx512_vbmi() * 1<<8; // not used | ||||||
|  |     score +=  ggml_cpu_has_avx512_bf16() * 1<<9; | ||||||
|  |     score +=  ggml_cpu_has_avx512_vnni() * 1<<10; | ||||||
|  |     score +=  ggml_cpu_has_amx_int8   () * 1<<11; | ||||||
|  |  | ||||||
|  |     return score; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score) | ||||||
|  |  | ||||||
|  | #endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) | ||||||
| @@ -128,7 +128,7 @@ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) { | |||||||
| } | } | ||||||
|  |  | ||||||
| static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) { | static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) { | ||||||
| #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) | #if defined(__AVX512VNNI__) | ||||||
|     const __m512i zero = _mm512_setzero_si512(); |     const __m512i zero = _mm512_setzero_si512(); | ||||||
|     return _mm512_dpbusd_epi32(zero, ax, sy); |     return _mm512_dpbusd_epi32(zero, ax, sy); | ||||||
| #else | #else | ||||||
|   | |||||||
							
								
								
									
										12
									
								
								scripts/build-cpu.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										12
									
								
								scripts/build-cpu.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,12 @@ | |||||||
|  | #!/bin/bash | ||||||
|  |  | ||||||
|  | name="$1" | ||||||
|  | args="${@:2}" | ||||||
|  |  | ||||||
|  | echo "Building $name with args: $args" | ||||||
|  |  | ||||||
|  | rm -fr build-cpu-$1 | ||||||
|  | cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args | ||||||
|  | cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc) | ||||||
|  | cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so | ||||||
|  | rm -fr build-cpu-$1 | ||||||
		Reference in New Issue
	
	Block a user
	 Diego Devesa
					Diego Devesa