mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
* model: add support for extra bufs for all devices * hexagon: add experimental ggml-hexagon backend for the Hexagon NPU This commit introduces a new experimental backend `ggml-hexagon` with support for the Hexagon NPU. Highlights: - Supports Hexagon versions: v73, v75, v79, and v81 - Targets Android devices based on Snapdragon SoCs: Gen3, 8-Elite, and 8-Elite Gen5 - Supports Q4_0, Q8_0, MXFP4, and FP32 data types - Implements core LLM ops: MUL_MAT/MUL_MAT_ID, ADD/SUB/MUL/ADD_ID, RMS_NORM, ROPE, GLU/SWIGLU, SOFTMAX **Note:** This backend is experimental and may exhibit instability or limited performance across supported devices. It is intended for early testing and feedback from llama.cpp/ggml developer and user community. Co-Authored-By: Rajdeep Ganguly <rganguly@qti.qualcomm.com> Co-Authored-By: Todor Boinovski <todorb@qti.qualcomm.com> * hexagon: fix format checker errors * hexagon: update readme and cmake presets * ci: add android-ndk-build jobs that build plain ARM64 and Snapdragon versions * hexagon: add simple graph optimizer for stacking MUL_MAT ops with the same input * hexagon: move ADB helper scripts into scripts/snapdragon/adb * hexagon: replace all f/printfs with GGML_LOG_... * readme: add hexagon to the list supported backends * hexagon: stack malmuts with quantized inputs only * hexagon: add TODO for fixing issues in hexagon_graph_optimize * hexagon: update to hex-sdk 6.4.0 and add scripts for running on QDC * scripts: fix lint errors * scripts: update qdc pytest script to make linter happy * hexagon: add reduce sum in fp32 * hexagon: reduce number of vector stores in matmul output * hexagon: remove the need for vdelta in reduce-multiply-x8 * hexagon: consistent use of reduce_sum_fp32 for row_sums * hexagon: some more matmul optimizations and comments Optimize cases where tensor dims are not multiple of 1024 (e.g in Qwen models). We've handled those cases already but at a higher overhead. * hexagon: update cmake presets * hexagon: add OPMASK support for run-bench.sh wrapper * hexagon: update to use GGML_BACKEND_API * hexagon: remove unused logic for setting tensor flags for the views * hexagon: add asserts to set/get_tensor to make sure we handle complete tensors Same asserts as the CPU backend. * hexagon: use cpy_tensor slow path for non-host buffers * hexagon: error checks in the buffer allocator * cmake: move include(extProj) under ggml-hexagon * hexagon: don't forget to delete the backend on free * hexagon: set/get_tensor size assert apply only to quantized tensors * hexagon: reintroduce HEX_VERBOSE wrapper for GGML_LOG_DEBUG for now GGML_LOG_DEBUG is always enabled for test-backend-ops and the output gets in the way. Ideally we need a bit more finer log levels. * docs: typos in hexagon developer docs (libggm-...) * hexagon: overhaul error handling in the session/device allocation this should handle all failure paths in the session allocation. * hexagon: update cmake presets to enable fp16 vectors * hexagon: remove unused time_usec function * hexagon: don't forget to release buffer contexts * hexagon: fixed indents in hvx-utils (missed clang-format auto-format failure) * hexagon: remove custom can_repeat function and use ggml_can_repeat --------- Co-authored-by: Rajdeep Ganguly <rganguly@qti.qualcomm.com> Co-authored-by: Todor Boinovski <todorb@qti.qualcomm.com>
50 lines
2.2 KiB
JSON
50 lines
2.2 KiB
JSON
{
|
|
"version": 4,
|
|
"configurePresets": [
|
|
{
|
|
"name": "arm64-android-snapdragon",
|
|
"hidden": true,
|
|
"architecture": { "value": "arm64", "strategy": "external" },
|
|
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
|
"cacheVariables": {
|
|
"ANDROID_ABI": "arm64-v8a",
|
|
"ANDROID_PLATFORM": "android-31",
|
|
"CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
|
|
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
|
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
|
|
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
|
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
|
|
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
|
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
|
|
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
|
"PREBUILT_LIB_DIR": "android_aarch64",
|
|
"GGML_OPENMP": "OFF",
|
|
"GGML_LLAMAFILE": "OFF",
|
|
"GGML_OPENCL": "ON",
|
|
"GGML_HEXAGON": "ON",
|
|
"LLAMA_CURL": "OFF"
|
|
}
|
|
},
|
|
|
|
{
|
|
"name": "arm64-windows-snapdragon",
|
|
"inherits": [ "base", "arm64-windows-llvm" ],
|
|
"cacheVariables": {
|
|
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
|
|
"PREBUILT_LIB_DIR": "windows_aarch64",
|
|
"GGML_OPENMP": "OFF",
|
|
"GGML_LLAMAFILE": "OFF",
|
|
"GGML_OPENCL": "ON",
|
|
"GGML_HEXAGON": "ON",
|
|
"LLAMA_CURL": "OFF"
|
|
}
|
|
},
|
|
|
|
{ "name": "arm64-android-snapdragon-debug" , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
|
|
{ "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
|
|
|
|
{ "name": "arm64-windows-snapdragon-debug" , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
|
|
{ "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
|
|
]
|
|
}
|