mirror of https://github.com/ggml-org/llama.cpp.git
bert : add tests + fix quantization (#5475)

* llama : do not quantize pos embd and token type tensors
* ci : add BERT tests (ggml-ci)
* ci : do not do BERT tests on low-perf nodes (ggml-ci)
Changed files:

- ci/run.sh  (+46)
- llama.cpp  (+5, -1)
ci/run.sh
@@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
     #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
+# bge-small
+
+function gg_run_embd_bge_small {
+    cd ${SRC}
+
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
+
+    path_models="../models-mnt/bge-small"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+
+    (time ./bin/embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+
+    set +e
+}
+
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    test $ret -eq 0 && gg_run embd_bge_small
+
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
             test $ret -eq 0 && gg_run open_llama_3b_v2
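Since the new step hooks into the existing gg_run harness, it can be exercised locally before pushing. A minimal sketch of a local run, following the conventions of llama.cpp's ci/README.md (the ./tmp paths are illustrative, not mandated):

```bash
# Run the CI suite locally; results are written to the first argument,
# downloaded models are cached under the second (mount) directory.
mkdir -p tmp

# Full run on a capable node: includes the new embd_bge_small test.
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# With GG_BUILD_LOW_PERF set, the [ -z ${GG_BUILD_LOW_PERF} ] guard fails,
# so the BERT test (and the other heavy model runs) are skipped.
GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```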
llama.cpp
@@ -10444,7 +10444,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;
 
         // do not quantize expert gating tensors
-        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+        // do not quantize positional embeddings and token types (BERT)
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
         enum ggml_type new_type;
         void * new_data;
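The switch from a substring find to an exact LLM_TN comparison means the exclusions track the architecture's canonical tensor names rather than a hard-coded string, and cannot accidentally match unrelated tensors. To spot-check the fix, the per-tensor report printed by quantize can be filtered; a hedged sketch, assuming the position_embd.weight / token_types.weight names from llama.cpp's BERT tensor naming (the exact log format may vary):

```bash
# Re-quantize the f16 BERT model and filter the per-tensor report.
# With the fix, the position embedding and token type tensors should be
# copied at their original precision instead of converted to q8_0.
./bin/quantize models-mnt/bge-small/ggml-model-f16.gguf \
               models-mnt/bge-small/ggml-model-q8_0.gguf q8_0 \
    | grep -E 'position_embd|token_types'
```

Both tensors are small per-position and per-type lookup tables, so keeping them in full precision adds almost nothing to model size while avoiding needless embedding error.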
Author: Georgi Gerganov