mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-28 08:31:25 +00:00 
			
		
		
		
	 f4ab2a4147
			
		
	
	f4ab2a4147
	
	
	
		
			
			* merged the changes from deepseeker models to main branch
* Moved regex patterns to unicode.cpp and updated unicode.h
* Moved header files
* Resolved issues
* added and refactored unicode_regex_split and related functions
* Updated/merged the deepseek coder pr
* Refactored code
* Adding unicode regex mappings
* Adding unicode regex function
* Added needed functionality, testing remains
* Fixed issues
* Fixed issue with gpt2 regex custom preprocessor
* unicode : fix? unicode_wstring_to_utf8
* lint : fix whitespaces
* tests : add tokenizer tests for numbers
* unicode : remove redundant headers
* tests : remove and rename tokenizer test scripts
* tests : add sample usage
* gguf-py : reader prints warnings on duplicate keys
* llama : towards llama3 tokenization support (wip)
* unicode : shot in the dark to fix tests on Windows
* unicode : first try custom implementations
* convert : add "tokenizer.ggml.pre" GGUF KV (wip)
* llama : use new pre-tokenizer type
* convert : fix pre-tokenizer type writing
* lint : fix
* make : add test-tokenizer-0-llama-v3
* wip
* models : add llama v3 vocab file
* llama : adapt punctuation regex + add llama 3 regex
* minor
* unicode : set bomb
* unicode : set bomb
* unicode : always use std::wregex
* unicode : support \p{N}, \p{L} and \p{P} natively
* unicode : try fix windows
* unicode : category support via std::regex
* unicode : clean-up
* unicode : simplify
* convert : add convert-hf-to-gguf-update.py
ggml-ci
* lint : update
* convert : add falcon
ggml-ci
* unicode : normalize signatures
* lint : fix
* lint : fix
* convert : remove unused functions
* convert : add comments
* convert : exercise contractions
ggml-ci
* lint : fix
* cmake : refactor test targets
* tests : refactor vocab tests
ggml-ci
* tests : add more vocabs and tests
ggml-ci
* unicode : cleanup
* scripts : ignore new update script in check-requirements.sh
* models : add phi-3, mpt, gpt-2, starcoder
* tests : disable obsolete
ggml-ci
* tests : use faster bpe test
ggml-ci
* llama : more prominent warning for old BPE models
* tests : disable test-tokenizer-1-bpe due to slowness
ggml-ci
---------
Co-authored-by: Jaggzh <jaggz.h@gmail.com>
Co-authored-by: Kazim Abrar Mahi <kazimabrarmahi135@gmail.com>
		
	
		
			
				
	
	
		
			180 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			180 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/bin/bash
# Strict mode: abort on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
 | |
| 
 | |
| #
 | |
| # check-requirements.sh checks all requirements files for each top-level
 | |
| # convert*.py script.
 | |
| #
 | |
| # WARNING: This is quite IO intensive, because a fresh venv is set up for every
 | |
| # python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately
 | |
| # sized tmpfs /tmp or ramdisk is recommended if running this frequently.
 | |
| #
 | |
| # usage:    check-requirements.sh [<working_dir>]
 | |
| #           check-requirements.sh nocleanup [<working_dir>]
 | |
| #
 | |
| # where:
 | |
| #           - <working_dir> is a directory that can be used as the base for
 | |
| #               setting up the venvs. Defaults to `/tmp`.
 | |
| #           - 'nocleanup' as the first argument will disable automatic cleanup
 | |
| #               of the files created by this script.
 | |
| #
 | |
| # requires:
 | |
| #           - bash >= 3.2.57
 | |
| #           - shellcheck
 | |
| #
 | |
| # For each script, it creates a fresh venv, `pip install`s the requirements, and
 | |
| # finally imports the python script to check for `ImportError`.
 | |
| #
 | |
| 
 | |
# log LEVEL MESSAGE
#
# Writes "LEVEL: MESSAGE" followed by a newline to stderr. Nothing is written
# to stdout.
log() {
    local severity=$1
    local text=$2
    printf '%s: %s\n' "$severity" "$text" >&2
}
 | |
| 
 | |
# debug MESSAGE
#
# Forwards its arguments to log with the DEBUG level.
debug() {
    log 'DEBUG' "$@"
}
 | |
| 
 | |
# info MESSAGE
#
# Forwards its arguments to log with the INFO level.
info() {
    log 'INFO' "$@"
}
 | |
| 
 | |
# fatal MESSAGE
#
# Forwards its arguments to log with the FATAL level, then terminates the
# current shell with exit status 1.
fatal() {
    log 'FATAL' "$@"
    exit 1
}
 | |
| 
 | |
# cleanup
#
# Recursively removes the working directory, printing a progress dot for
# roughly every 750 removed entries.
#
# Globals:  workdir (read)
# Outputs:  progress dots and log messages on stderr; nothing on stdout
# Returns:  0 without doing anything when $workdir is unset, is not a
#           directory, or is not writable
cleanup() {
    [[ -n ${workdir+x} && -d $workdir && -w $workdir ]] || return 0

    info "Removing $workdir"
    local count=0
    # `rm -v` prints one line per removed entry; the loop consumes those lines
    # and turns them into sparse progress output. The counter lives in the
    # pipeline's subshell, which is fine because it is only used inside the loop.
    # The `:?` guard aborts instead of running `rm -rf` on an empty operand.
    rm -rfv -- "${workdir:?}" | while read -r; do
        if (( count++ > 750 )); then
            # Progress goes to stderr, like every other diagnostic of this
            # script, so stdout stays clean.
            printf >&2 .
            count=0
        fi
    done
    printf >&2 '\n'
    info "Removed $workdir"
}
 | |
| 
 | |
# Automatic cleanup is on by default. A leading 'nocleanup' argument turns it
# off and is consumed, so the optional <working_dir> is always $1 afterwards.
do_cleanup=1
if [[ ${1-} == nocleanup ]]; then
    do_cleanup=0
    shift
fi

if (( do_cleanup )); then
    # Turn INT/TERM into a regular exit so that the EXIT trap below still runs.
    # Exit with the conventional 128+signal status: a bare `exit` would reuse
    # the status of whatever command ran last, which can be 0.
    trap 'exit 130' INT
    trap 'exit 143' TERM
    trap cleanup EXIT
fi
 | |
| 
 | |
# Absolute path of this script; used both to find the project directory and
# as the shellcheck target.
this=$(realpath -- "$0")
readonly this

# PWD should stay in llama.cpp project directory
cd "$(dirname "$this")/.."

# Lint this script itself before doing any real work.
shellcheck "$this"
 | |
| 
 | |
# Directory (relative path) holding the per-script requirements files.
readonly reqs_dir=requirements

# Base directory for the working directory: the first remaining argument when
# one was given, /tmp otherwise.
if (( $# > 0 )); then
    tmp_dir=$(realpath -- "$1")
    if [[ ! -d $tmp_dir || ! -w $tmp_dir ]]; then
        fatal "$tmp_dir is not a writable directory"
    fi
else
    tmp_dir=/tmp
fi

# Fresh, uniquely named directory that will hold every venv created below.
workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXX")
readonly workdir
info "Working directory: $workdir"
 | |
| 
 | |
# check_requirements REQS_FILE
#
# Installs REQS_FILE with pip into whatever Python environment is currently
# active, logging a message before and after the installation.
check_requirements() {
    local reqs_file=$1
    local -a pip_args=(--disable-pip-version-check install -qr "$reqs_file")

    info "$reqs_file: beginning check"
    pip "${pip_args[@]}"
    info "$reqs_file: OK"
}
 | |
| 
 | |
# check_convert_script PY
#
# Checks one convert script: creates a fresh venv, installs the script's
# requirements file into it and then imports the script so that a missing
# dependency surfaces as an import failure.
#
# Globals:   reqs_dir, workdir, do_cleanup (all read)
# Arguments: $1 - path of the python script, e.g. ./convert-hf-to-gguf.py
# Exits:     via fatal when "$reqs_dir/requirements-<name>.txt" is not readable
check_convert_script() {
    local py=$1             # e.g. ./convert-hf-to-gguf.py
    local pyname=${py##*/}  # e.g. convert-hf-to-gguf.py
    pyname=${pyname%.py}    # e.g. convert-hf-to-gguf

    info "$py: beginning check"

    local reqs="$reqs_dir/requirements-$pyname.txt"
    if [[ ! -r $reqs ]]; then
        fatal "$py missing requirements. Expected: $reqs"
    fi

    local venv="$workdir/$pyname-venv"
    python3 -m venv "$venv"

    # The subshell keeps the venv activation from leaking into the caller.
    (
        # shellcheck source=/dev/null
        source "$venv/bin/activate"

        check_requirements "$reqs"

        # Import the script by file path. This is the spec-based recipe from
        # the importlib documentation; it replaces the deprecated
        # SourceFileLoader.load_module().
        python - "$py" "$pyname" <<'EOF'
import importlib.util
import sys

py, pyname = sys.argv[1:]
spec = importlib.util.spec_from_file_location(pyname, py)
if spec is None or spec.loader is None:
    sys.exit(f"cannot create an import spec for {py}")
module = importlib.util.module_from_spec(spec)
# Register before executing, as a regular import would, so that code looking
# itself up in sys.modules during import keeps working.
sys.modules[pyname] = module
spec.loader.exec_module(module)
EOF
    )

    if (( do_cleanup )); then
        rm -rf -- "$venv"
    fi

    info "$py: imports OK"
}
 | |
| 
 | |
# Marker that exempts a requirements line from the "no exact pins" check when
# it appears anywhere on that line.
readonly ignore_eq_eq='check_requirements: ignore "=="'

# _has_unexempted_pin FILE
#
# Succeeds when at least one line of FILE contains '==' and does not contain
# the $ignore_eq_eq marker; fails otherwise.
#
# The scan is a single in-process pass on purpose: a `grep -v ... | grep -q`
# pipeline can be reported as failed under `set -o pipefail` when `grep -q`
# exits on its first match and the upstream grep is killed by SIGPIPE, which
# would let a pinned version slip through unnoticed.
_has_unexempted_pin() {
    local line
    while IFS= read -r line || [[ -n $line ]]; do
        if [[ $line == *"$ignore_eq_eq"* ]]; then
            continue
        fi
        if [[ $line == *'=='* ]]; then
            return 0
        fi
    done < "$1"
    return 1
}

for req in "$reqs_dir"/*; do
    # Check that all sub-requirements are added to top-level requirements.txt
    if ! grep -qF -- "$req" requirements.txt; then
        fatal "$req needs to be added to requirements.txt"
    fi

    # Make sure exact release versions aren't being pinned in the requirements
    # Lines carrying the ignore string are exempt
    if _has_unexempted_pin "$req"; then
        tab=$'\t'
        cat >&2 <<EOF
FATAL: Avoid pinning exact package versions. Use '~=' instead.
You can suppress this error by appending the following to the line:
$tab# $ignore_eq_eq
EOF
        exit 1
    fi
done
 | |
| 
 | |
# Install the aggregate top-level requirements.txt into a venv of its own.
all_venv="$workdir/all-venv"
python3 -m venv "$all_venv"

(
    # Activated inside a subshell so the venv's environment does not leak into
    # the rest of the script.
    # shellcheck source=/dev/null
    . "$all_venv/bin/activate"
    check_requirements requirements.txt
)

# Remove the venv as soon as it is no longer needed, unless cleanup is disabled.
if [[ $do_cleanup -ne 0 ]]; then
    rm -rf -- "$all_venv"
fi
 | |
| 
 | |
# convert.py does not match the convert-*.py glob, so it is checked explicitly.
check_convert_script convert.py
for py in convert-*.py; do
    case $py in
        # skip convert-hf-to-gguf-update.py
        # TODO: the check is failing for some reason:
        #       https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
        convert-hf-to-gguf-update.py)
            continue
            ;;
    esac

    check_convert_script "$py"
done

info 'Done! No issues found.'
 |