Mirror of https://github.com/ggml-org/llama.cpp.git

Commit 63d2fc46e1
* model: add support for extra bufs for all devices

* hexagon: add experimental ggml-hexagon backend for the Hexagon NPU

  This commit introduces a new experimental backend `ggml-hexagon` with support for the Hexagon NPU.

  Highlights:
  - Supports Hexagon versions: v73, v75, v79, and v81
  - Targets Android devices based on Snapdragon SoCs: Gen3, 8-Elite, and 8-Elite Gen5
  - Supports Q4_0, Q8_0, MXFP4, and FP32 data types
  - Implements core LLM ops: MUL_MAT/MUL_MAT_ID, ADD/SUB/MUL/ADD_ID, RMS_NORM, ROPE, GLU/SWIGLU, SOFTMAX

  **Note:** This backend is experimental and may exhibit instability or limited performance across supported devices. It is intended for early testing and feedback from the llama.cpp/ggml developer and user community.

  Co-Authored-By: Rajdeep Ganguly <rganguly@qti.qualcomm.com>
  Co-Authored-By: Todor Boinovski <todorb@qti.qualcomm.com>

* hexagon: fix format checker errors

* hexagon: update readme and cmake presets

* ci: add android-ndk-build jobs that build plain ARM64 and Snapdragon versions

* hexagon: add simple graph optimizer for stacking MUL_MAT ops with the same input

* hexagon: move ADB helper scripts into scripts/snapdragon/adb

* hexagon: replace all f/printfs with GGML_LOG_...

* readme: add hexagon to the list of supported backends

* hexagon: stack matmuls with quantized inputs only

* hexagon: add TODO for fixing issues in hexagon_graph_optimize

* hexagon: update to hex-sdk 6.4.0 and add scripts for running on QDC

* scripts: fix lint errors

* scripts: update qdc pytest script to make linter happy

* hexagon: add reduce sum in fp32

* hexagon: reduce number of vector stores in matmul output

* hexagon: remove the need for vdelta in reduce-multiply-x8

* hexagon: consistent use of reduce_sum_fp32 for row_sums

* hexagon: some more matmul optimizations and comments

  Optimize cases where tensor dims are not a multiple of 1024 (e.g. in Qwen models). Those cases were already handled, but at a higher overhead.

* hexagon: update cmake presets

* hexagon: add OPMASK support for the run-bench.sh wrapper

* hexagon: update to use GGML_BACKEND_API

* hexagon: remove unused logic for setting tensor flags for the views

* hexagon: add asserts to set/get_tensor to make sure we handle complete tensors

  Same asserts as the CPU backend.

* hexagon: use cpy_tensor slow path for non-host buffers

* hexagon: error checks in the buffer allocator

* cmake: move include(extProj) under ggml-hexagon

* hexagon: don't forget to delete the backend on free

* hexagon: set/get_tensor size asserts apply only to quantized tensors

* hexagon: reintroduce HEX_VERBOSE wrapper for GGML_LOG_DEBUG for now

  GGML_LOG_DEBUG is always enabled for test-backend-ops and its output gets in the way. Ideally we need finer-grained log levels.

* docs: fix typos in hexagon developer docs (libggm-...)

* hexagon: overhaul error handling in the session/device allocation

  This should handle all failure paths in session allocation.

* hexagon: update cmake presets to enable fp16 vectors

* hexagon: remove unused time_usec function

* hexagon: don't forget to release buffer contexts

* hexagon: fix indentation in hvx-utils (missed clang-format auto-format failure)

* hexagon: remove custom can_repeat function and use ggml_can_repeat

---------

Co-authored-by: Rajdeep Ganguly <rganguly@qti.qualcomm.com>
Co-authored-by: Todor Boinovski <todorb@qti.qualcomm.com>
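The "graph optimizer for stacking MUL_MAT ops with the same input" bullet describes a common batching idea: consecutive matmuls that share an input activation (e.g. the Q/K/V projections in attention) can be grouped so the shared operand is processed once per group. The Python sketch below only illustrates that grouping pass under assumed names (Op, stack_matmuls are invented for illustration); it does not reflect the actual ggml-hexagon implementation, which per the later bullet stacks matmuls with quantized inputs only.

from dataclasses import dataclass

# Hypothetical sketch of the stacking idea; names are illustrative,
# not from ggml-hexagon.
@dataclass
class Op:
    kind: str              # e.g. "MUL_MAT", "ADD", ...
    src: str               # identifier of the input tensor
    quantized: bool = True

def stack_matmuls(graph: list[Op]) -> list[list[Op]]:
    """Walk the graph in order, collecting runs of quantized MUL_MAT
    ops that share the same input tensor into one stacked group."""
    groups: list[list[Op]] = []
    for op in graph:
        prev = groups[-1] if groups else None
        if (prev is not None
                and op.kind == "MUL_MAT" and op.quantized
                and prev[0].kind == "MUL_MAT" and prev[0].quantized
                and prev[0].src == op.src):
            prev.append(op)      # stack onto the current group
        else:
            groups.append([op])  # start a new group
    return groups

# Example: three projections sharing input "x", then an unrelated ADD.
g = [Op("MUL_MAT", "x"), Op("MUL_MAT", "x"), Op("MUL_MAT", "x"), Op("ADD", "y")]
print([len(grp) for grp in stack_matmuls(g)])  # -> [3, 1]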
		
			
				
	
	
		
64 lines · 1.8 KiB · Python
import subprocess
import sys

import pytest

tmp_path = '/data/local/tmp'
pkg_path = f'{tmp_path}/llama.cpp'
lib_path = f'{pkg_path}/lib'
bin_path = f'{pkg_path}/bin'

model = '../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf'
cli_pref = f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}'


def run_cmd(cmd):
    # Run the command, mirror its combined stdout/stderr, and fail the
    # test on a non-zero exit code.
    p = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    sys.stdout.write(p.stdout)
    assert p.returncode == 0


@pytest.mark.dependency()
def test_install():
    # Push the llama.cpp package to the device and make the binaries executable.
    run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}'])
    run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*'])


## Basic cli tests

def run_llama_cli(dev, opts):
    prompt = 'what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.'
    opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts
    run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"'])


@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_cpu():
    run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_gpu():
    run_llama_cli('GPUOpenCL', '-fa on')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_npu():
    run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on')


## Basic bench tests

def run_llama_bench(dev):
    run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32'])


@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_cpu():
    run_llama_bench('none')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_gpu():
    run_llama_bench('GPUOpenCL')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_npu():
    run_llama_bench('HTP0')
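A note on running this script: `@pytest.mark.dependency` comes from the pytest-dependency plugin, so the host needs pytest and pytest-dependency installed, plus adb with a connected Snapdragon device. The script's filename is not shown on this page, so assuming it is saved as, say, `test_llama.py`, a typical invocation would be `pytest -v test_llama.py`, run from a directory containing the built `llama.cpp` package to push and the `../gguf` model path the script references.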