mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
* devops: move s390x and ppc64le ci build we have access to ubuntu-24.04-s390x and ppc64le images now Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: disable ppc64le for now since they have compiler errors Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: stop warnings as errors Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: switch to non-macro flag Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: going the llama macro route Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: add big-endian gguf test models Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: disable ppc64le to test s390x, check test build Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: dup .gguf.inp files for big-endian tests Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: dup .gguf.out files for big-endian too Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: add python setup and endian byteswap Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: pooring thing does not have s390x python3 Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: add missing rust compiler for s390x Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: try rust actions runner Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Revert "devops: try rust actions runner" This reverts commit 3f8db04356033d6c1d7eccc75ca396bc5298250c. 
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: try a different path for rust Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: dump home directory and user info Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: install gguf-py only Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: missed relative path Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: remove big-endian files since local swapping is working Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: revert test-tokenizer-0 cmakelists Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Fix unicode flags conversion from and to uint16_t Bitfields are allocated in different order on s390x Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Simplify byteswap command Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Add byteswapping and git-lfs for test-tokenizers-ggml-vocabs Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Fix endianness detection in vocab loader Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Disable test-thread-safety on s390x In this test a model is downloaded, then immediately loaded to check if more downloads are needed, and then used for test. There is no clean way to separate all those steps to add byteswapping between them, so just skip this test. Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Fix q8_0 test in test-quantize-fns vec_signed uses unexpected rounding mode. Explicitly use different rounding function. Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: add big-endian stories260K Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: add s390x test-eval-callback Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: fix test does not exist Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: fix model not found llama-eval-callback Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Fix q3_K dot product error in test-quantize-fns on s390x Array q8bytes had only 4 elements allocated, but 8 elements accessed. 
This lead to write out of bounds and later read of overwritten values out of bounds and incorrect result. Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: re-enable ppc64le for testing Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: activate test-thread-safety for s390x Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: disable ppc64le tests for some reason it keeps failing test-thread-safety tests and I do not have a machine that is able to replicate the tests. Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * devops: LLAMA_FATAL_WARNINGS=ON Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Correct repository URL for s390x for test-thread-safety model Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Fix fs_get_cache_directory Ensure it works even if both XDG_CACHE_HOME and HOME are unset. This might happen in containers. Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Re-enable CI for ppc64le Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Fortify ggml_rope_impl Only memcpy data from sections argument if it's non-NULL. Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> * Add TODO in struct unicode_cpt_flags to reimplement it in endian-independent way * Update URL for big-endian model * Update .github/workflows/build.yml Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update remaining mentions of BE models to ggml-org/models repo --------- Signed-off-by: Aaron Teo <aaron.teo1@ibm.com> Co-authored-by: Aleksei Nikiforov <aleksei.nikiforov@linux.ibm.com> Co-authored-by: Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
112 lines
3.8 KiB
C++
112 lines
3.8 KiB
C++
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
// Per-codepoint Unicode property flags.
//
// The flags are stored as twelve 1-bit fields and can be converted to and from
// a packed uint16_t whose bit positions are given by the enum constants below.
// Both conversions work field by field, so the packed value does not depend on
// the host byte order or on the order in which the compiler allocates
// bit-fields, and no type punning through reinterpret_cast is needed.
//
// TODO: the storage is still a run of bit-fields, whose in-memory layout is
//       implementation-defined; reimplement this structure on top of a single
//       uint16_t so that the object representation is endian-independent too
struct unicode_cpt_flags {
    // bit positions of the packed uint16_t representation
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // all category bits above
        WHITESPACE      = 0x0100,
        LOWERCASE       = 0x0200,
        UPPERCASE       = 0x0400,
        NFD             = 0x0800,
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // decode from uint16
    //
    // Each field is set from the bit named by the matching enum constant.
    // Bits of `flags` that are not named in the enum are ignored.
    inline unicode_cpt_flags(const uint16_t flags = 0) {
        is_undefined   = (flags & UNDEFINED)   ? 1 : 0;
        is_number      = (flags & NUMBER)      ? 1 : 0;
        is_letter      = (flags & LETTER)      ? 1 : 0;
        is_separator   = (flags & SEPARATOR)   ? 1 : 0;
        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
        is_symbol      = (flags & SYMBOL)      ? 1 : 0;
        is_control     = (flags & CONTROL)     ? 1 : 0;
        is_whitespace  = (flags & WHITESPACE)  ? 1 : 0;
        is_lowercase   = (flags & LOWERCASE)   ? 1 : 0;
        is_uppercase   = (flags & UPPERCASE)   ? 1 : 0;
        is_nfd         = (flags & NFD)         ? 1 : 0;
    }

    // encode to uint16
    //
    // Inverse of the constructor: every field that is set contributes the bit
    // named by its enum constant; the result never has any other bit set.
    inline uint16_t as_uint() const {
        // each field is 0 or 1, so field * CONSTANT is either 0 or the flag bit
        return static_cast<uint16_t>(
              is_undefined   * UNDEFINED
            | is_number      * NUMBER
            | is_letter      * LETTER
            | is_separator   * SEPARATOR
            | is_accent_mark * ACCENT_MARK
            | is_punctuation * PUNCTUATION
            | is_symbol      * SYMBOL
            | is_control     * CONTROL
            | is_whitespace  * WHITESPACE
            | is_lowercase   * LOWERCASE
            | is_uppercase   * UPPERCASE
            | is_nfd         * NFD
        );
    }

    // packed flags restricted to the category bits (MASK_CATEGORIES)
    inline uint16_t category_flag() const {
        return this->as_uint() & MASK_CATEGORIES;
    }
};
|
|
|
|
size_t unicode_len_utf8(char src);
|
|
|
|
std::string unicode_cpt_to_utf8 (uint32_t cpt);
|
|
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
|
|
|
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
|
|
|
|
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
|
|
|
|
unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
|
|
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
|
|
|
|
std::string unicode_byte_to_utf8(uint8_t byte);
|
|
uint8_t unicode_utf8_to_byte(const std::string & utf8);
|
|
|
|
uint32_t unicode_tolower(uint32_t cpt);
|
|
|
|
bool unicode_cpt_is_han(uint32_t cpt);
|
|
|
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
|