mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	* merged the changes from deepseeker models to main branch
* Moved regex patterns to unicode.cpp and updated unicode.h
* Moved header files
* Resolved issues
* added and refactored unicode_regex_split and related functions
* Updated/merged the deepseek coder pr
* Refactored code
* Adding unicode regex mappings
* Adding unicode regex function
* Added needed functionality, testing remains
* Fixed issues
* Fixed issue with gpt2 regex custom preprocessor
* unicode : fix? unicode_wstring_to_utf8
* lint : fix whitespaces
* tests : add tokenizer tests for numbers
* unicode : remove redundant headers
* tests : remove and rename tokenizer test scripts
* tests : add sample usage
* gguf-py : reader prints warnings on duplicate keys
* llama : towards llama3 tokenization support (wip)
* unicode : shot in the dark to fix tests on Windows
* unicode : first try custom implementations
* convert : add "tokenizer.ggml.pre" GGUF KV (wip)
* llama : use new pre-tokenizer type
* convert : fix pre-tokenizer type writing
* lint : fix
* make : add test-tokenizer-0-llama-v3
* wip
* models : add llama v3 vocab file
* llama : adapt punctuation regex + add llama 3 regex
* minor
* unicode : set bomb
* unicode : set bomb
* unicode : always use std::wregex
* unicode : support \p{N}, \p{L} and \p{P} natively
* unicode : try fix windows
* unicode : category support via std::regex
* unicode : clean-up
* unicode : simplify
* convert : add convert-hf-to-gguf-update.py
ggml-ci
* lint : update
* convert : add falcon
ggml-ci
* unicode : normalize signatures
* lint : fix
* lint : fix
* convert : remove unused functions
* convert : add comments
* convert : exercise contractions
ggml-ci
* lint : fix
* cmake : refactor test targets
* tests : refactor vocab tests
ggml-ci
* tests : add more vocabs and tests
ggml-ci
* unicode : cleanup
* scripts : ignore new update script in check-requirements.sh
* models : add phi-3, mpt, gpt-2, starcoder
* tests : disable obsolete
ggml-ci
* tests : use faster bpe test
ggml-ci
* llama : more prominent warning for old BPE models
* tests : disable test-tokenizer-1-bpe due to slowness
ggml-ci
---------
Co-authored-by: Jaggzh <jaggz.h@gmail.com>
Co-authored-by: Kazim Abrar Mahi <kazimabrarmahi135@gmail.com>
		
	
		
			
				
	
	
		
			180 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			180 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/bin/bash
# Strict mode: abort on unhandled errors (-e), on use of unset variables (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
 | 
						|
 | 
						|
#
# check-requirements.sh checks all requirements files for each top-level
# convert*.py script.
#
# WARNING: This is quite IO intensive, because a fresh venv is set up for every
# python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately
# sized tmpfs /tmp or ramdisk is recommended if running this frequently.
#
# usage:    check-requirements.sh [<working_dir>]
#           check-requirements.sh nocleanup [<working_dir>]
#
# where:
#           - <working_dir> is a directory that can be used as the base for
#               setting up the venvs. Defaults to `/tmp`.
#           - 'nocleanup' as the first argument will disable automatic cleanup
#               of the files created by this script.
#
# requires:
#           - bash >= 3.2.57
#           - shellcheck
#
# For each script, it creates a fresh venv, `pip install`s the requirements, and
# finally imports the python script to check for `ImportError`.
#
 | 
						|
 | 
						|
# Write one log line to stderr in the form "<LEVEL>: <message>".
# Arguments:
#   $1   - severity label, e.g. INFO
#   $2.. - message; additional words are joined with single spaces so that
#          callers forwarding "$@" do not silently lose text
log() {
    local level=$1
    shift
    printf >&2 '%s: %s\n' "$level" "$*"
}
 | 
						|
 | 
						|
# Log the given message at DEBUG level; all arguments are forwarded to log.
debug() {
    log DEBUG "$@"
}
 | 
						|
 | 
						|
# Log the given message at INFO level; all arguments are forwarded to log.
info() {
    log INFO "$@"
}
 | 
						|
 | 
						|
# Log the given message at FATAL level, then terminate the script with
# exit status 1.
fatal() {
    log FATAL "$@"
    exit 1
}
 | 
						|
 | 
						|
# Remove the working directory created by this script, printing a coarse
# progress indicator while doing so.
# Globals: workdir (read). Nothing happens unless it is set and names a
#          writable directory, so this is safe to run before workdir exists.
cleanup() {
    if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then
        info "Removing $workdir"
        local count=0
        # `rm -v` emits one line per removed entry; print a dot roughly every
        # 750 entries instead of echoing each path. The counter only needs to
        # live inside the pipeline's subshell.
        rm -rfv -- "$workdir" | while read -r; do
            if (( count++ > 750 )); then
                printf .
                count=0
            fi
        done
        printf '\n'
        info "Removed $workdir"
    fi
}
 | 
						|
 | 
						|
# An optional leading 'nocleanup' argument keeps the files this script creates.
do_cleanup=1
if [[ ${1-} == nocleanup ]]; then
    do_cleanup=0; shift
fi

if (( do_cleanup )); then
    # Convert INT/TERM into a regular exit so the EXIT trap (cleanup) fires.
    trap exit INT TERM
    trap cleanup EXIT
fi

this=$(realpath -- "$0"); readonly this
cd "$(dirname "$this")/.." # PWD should stay in llama.cpp project directory

# Lint this script itself before doing any expensive work.
shellcheck "$this"

readonly reqs_dir=requirements

# The next argument, if present, is the base directory for the venvs.
if [[ ${1+x} ]]; then
    tmp_dir=$(realpath -- "$1")
    if [[ ! ( -d $tmp_dir && -w $tmp_dir ) ]]; then
        fatal "$tmp_dir is not a writable directory"
    fi
else
    tmp_dir=/tmp
fi

workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXX"); readonly workdir
info "Working directory: $workdir"
 | 
						|
 | 
						|
check_requirements() {
 | 
						|
    local reqs=$1
 | 
						|
 | 
						|
    info "$reqs: beginning check"
 | 
						|
    pip --disable-pip-version-check install -qr "$reqs"
 | 
						|
    info "$reqs: OK"
 | 
						|
}
 | 
						|
 | 
						|
# Verify one convert script: create a fresh venv, install the script's
# requirements file, then import the script to surface import errors.
# Globals:   reqs_dir, workdir, do_cleanup (all read)
# Arguments: $1 - path to the python script, e.g. ./convert-hf-to-gguf.py
# Exits via fatal if the matching requirements file is missing or unreadable.
check_convert_script() {
    local py=$1             # e.g. ./convert-hf-to-gguf.py
    local pyname=${py##*/}  # e.g. convert-hf-to-gguf.py
    pyname=${pyname%.py}    # e.g. convert-hf-to-gguf

    info "$py: beginning check"

    local reqs="$reqs_dir/requirements-$pyname.txt"
    if [[ ! -r $reqs ]]; then
        fatal "$py missing requirements. Expected: $reqs"
    fi

    local venv="$workdir/$pyname-venv"
    python3 -m venv "$venv"

    # Subshell so that the venv activation does not leak into later checks.
    (
        # shellcheck source=/dev/null
        source "$venv/bin/activate"

        check_requirements "$reqs"

        # Import the script by file path (its name is not a valid identifier).
        # Loader.load_module() is deprecated, so build the module from a spec
        # and run it with exec_module() instead.
        python - "$py" "$pyname" <<'EOF'
import sys
import importlib.util
from importlib.machinery import SourceFileLoader
py, pyname = sys.argv[1:]
loader = SourceFileLoader(pyname, py)
spec = importlib.util.spec_from_loader(pyname, loader)
module = importlib.util.module_from_spec(spec)
sys.modules[pyname] = module
loader.exec_module(module)
EOF
    )

    if (( do_cleanup )); then
        rm -rf -- "$venv"
    fi

    info "$py: imports OK"
}
 | 
						|
 | 
						|
# Marker that, when present on a requirements line, exempts that line from the
# "no exact version pins" rule below.
readonly ignore_eq_eq='check_requirements: ignore "=="'

for req in "$reqs_dir"/*; do
    # Check that all sub-requirements are added to top-level requirements.txt
    if ! grep -qF "$req" requirements.txt; then
        fatal "$req needs to be added to requirements.txt"
    fi

    # Make sure exact release versions aren't being pinned in the requirements
    # Filters out the ignore string
    # The last stage deliberately reads all of its input instead of using -q:
    # an early exit could SIGPIPE the first grep, and under `pipefail` that
    # would make the whole pipeline look like "no match".
    if grep -vF "$ignore_eq_eq" "$req" | grep '==' >/dev/null; then
        tab=$'\t'
        cat >&2 <<EOF
FATAL: Avoid pinning exact package versions. Use '~=' instead.
You can suppress this error by appending the following to the line:
$tab# $ignore_eq_eq
EOF
        exit 1
    fi
done

# Check that the combined top-level requirements install into a fresh venv.
all_venv="$workdir/all-venv"
python3 -m venv "$all_venv"

(
    # shellcheck source=/dev/null
    source "$all_venv/bin/activate"
    check_requirements requirements.txt
)

if (( do_cleanup )); then
    rm -rf -- "$all_venv"
fi

# Check each convert script against its own requirements file.
check_convert_script convert.py
for py in convert-*.py; do
    # skip convert-hf-to-gguf-update.py
    # TODO: the check is failing for some reason:
    #       https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
    [[ $py == convert-hf-to-gguf-update.py ]] && continue

    check_convert_script "$py"
done

info 'Done! No issues found.'
 |