	check C++ code with -Wmissing-declarations (#3184)
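For context, -Wmissing-declarations makes the compiler warn whenever a function with external linkage is defined without a previous declaration. That catches file-local helpers that were accidentally left visible to every translation unit, which is exactly what this commit cleans up: the fix in almost every hunk below is to mark the helper `static`. A minimal sketch of the warning and the fix (the function name is made up for illustration):

    // helper.cpp
    //
    // g++ -Wmissing-declarations -c helper.cpp
    // would warn, roughly "no previous declaration for 'int add_twice(int)'",
    // if the definition below were not static and no header declared it.
    //
    // Giving the helper internal linkage, as this commit does throughout the
    // tree, silences the warning and documents that it is file-local:
    static int add_twice(int x) {
        return x + x;
    }

    int main() { // main() is exempt from the warning
        return add_twice(21) == 42 ? 0 : 1;
    }

Note the split between the two build systems below: CMake passes -Wmissing-declarations for all C++ compilers, while the Makefile additionally passes -Wmissing-prototypes under clang++ only, since GCC accepts that flag for C but rejects it for C++.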
CMakeLists.txt
@@ -427,6 +427,7 @@ if (LLAMA_ALL_WARNINGS)
             -Wextra
             -Wpedantic
             -Wcast-qual
+            -Wmissing-declarations
             -Wno-unused-function
             -Wno-multichar
         )

Makefile (13 changed lines)
@@ -172,9 +172,16 @@ endif # LLAMA_DISABLE_LOGS
 # warnings
 MK_CFLAGS    += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
 				-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
-MK_CXXFLAGS  += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+MK_CXXFLAGS  += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
 
-ifeq '' '$(findstring clang,$(shell $(CXX) --version))'
+# TODO(cebtenzzre): remove this once PR #2632 gets merged
+TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
+
+ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
+	# clang++ only
+	MK_CXXFLAGS   += -Wmissing-prototypes
+	TTFS_CXXFLAGS += -Wno-missing-prototypes
+else
 	# g++ only
 	MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
 endif
@@ -524,7 +531,7 @@ gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

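The one carve-out is train-text-from-scratch: it is now compiled with TTFS_CXXFLAGS, i.e. with -Wno-missing-declarations (plus -Wno-missing-prototypes under clang++), so the new warning stays disabled for that one example until the change referenced in the TODO above (PR #2632) is merged.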
common/common.cpp
@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-void process_escapes(std::string& input) {
+static void process_escapes(std::string& input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
 

common/console.cpp
@@ -158,7 +158,7 @@ namespace console {
         }
     }
 
-    char32_t getchar32() {
+    static char32_t getchar32() {
 #if defined(_WIN32)
         HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
         wchar_t high_surrogate = 0;
@@ -212,7 +212,7 @@ namespace console {
 #endif
     }
 
-    void pop_cursor() {
+    static void pop_cursor() {
 #if defined(_WIN32)
         if (hConsole != NULL) {
             CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
@@ -233,7 +233,7 @@ namespace console {
         putc('\b', out);
     }
 
-    int estimateWidth(char32_t codepoint) {
+    static int estimateWidth(char32_t codepoint) {
 #if defined(_WIN32)
         (void)codepoint;
         return 1;
@@ -242,7 +242,7 @@ namespace console {
 #endif
     }
 
-    int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
+    static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
 #if defined(_WIN32)
         CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
         if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
@@ -303,7 +303,7 @@ namespace console {
 #endif
     }
 
-    void replace_last(char ch) {
+    static void replace_last(char ch) {
 #if defined(_WIN32)
         pop_cursor();
         put_codepoint(&ch, 1, 1);
@@ -312,7 +312,7 @@ namespace console {
 #endif
     }
 
-    void append_utf8(char32_t ch, std::string & out) {
+    static void append_utf8(char32_t ch, std::string & out) {
         if (ch <= 0x7F) {
             out.push_back(static_cast<unsigned char>(ch));
         } else if (ch <= 0x7FF) {
@@ -333,7 +333,7 @@ namespace console {
     }
 
     // Helper function to remove the last UTF-8 character from a string
-    void pop_back_utf8_char(std::string & line) {
+    static void pop_back_utf8_char(std::string & line) {
         if (line.empty()) {
             return;
         }
@@ -349,7 +349,7 @@ namespace console {
         line.erase(pos);
     }
 
-    bool readline_advanced(std::string & line, bool multiline_input) {
+    static bool readline_advanced(std::string & line, bool multiline_input) {
         if (out != stdout) {
             fflush(stdout);
         }
@@ -452,7 +452,7 @@ namespace console {
         return has_more;
     }
 
-    bool readline_simple(std::string & line, bool multiline_input) {
+    static bool readline_simple(std::string & line, bool multiline_input) {
 #if defined(_WIN32)
         std::wstring wline;
         if (!std::getline(std::wcin, wline)) {

common/grammar-parser.cpp
@@ -9,7 +9,7 @@
 namespace grammar_parser {
     // NOTE: assumes valid utf8 (but checks for overrun)
     // copied from llama.cpp
-    std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
         static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
         uint8_t  first_byte = static_cast<uint8_t>(*src);
         uint8_t  highbits   = first_byte >> 4;
@@ -24,19 +24,19 @@ namespace grammar_parser {
         return std::make_pair(value, pos);
     }
 
-    uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
+    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
         uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
         auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
         return result.first->second;
     }
 
-    uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
+    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
         uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
         state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
         return next_id;
     }
 
-    void add_rule(
+    static void add_rule(
             parse_state & state,
             uint32_t      rule_id,
             const std::vector<llama_grammar_element> & rule) {
@@ -46,11 +46,11 @@ namespace grammar_parser {
         state.rules[rule_id] = rule;
     }
 
-    bool is_word_char(char c) {
+    static bool is_word_char(char c) {
         return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
     }
 
-    std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
         const char * pos   = src;
         const char * end   = src + size;
         uint32_t     value = 0;
@@ -73,7 +73,7 @@ namespace grammar_parser {
         return std::make_pair(value, pos);
     }
 
-    const char * parse_space(const char * src, bool newline_ok) {
+    static const char * parse_space(const char * src, bool newline_ok) {
         const char * pos = src;
         while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
                 (newline_ok && (*pos == '\r' || *pos == '\n'))) {
@@ -88,7 +88,7 @@ namespace grammar_parser {
         return pos;
     }
 
-    const char * parse_name(const char * src) {
+    static const char * parse_name(const char * src) {
         const char * pos = src;
         while (is_word_char(*pos)) {
             pos++;
@@ -99,7 +99,7 @@ namespace grammar_parser {
         return pos;
     }
 
-    std::pair<uint32_t, const char *> parse_char(const char * src) {
+    static std::pair<uint32_t, const char *> parse_char(const char * src) {
         if (*src == '\\') {
             switch (src[1]) {
                 case 'x': return parse_hex(src + 2, 2);
@@ -129,7 +129,7 @@ namespace grammar_parser {
             uint32_t            rule_id,
             bool                is_nested);
 
-    const char * parse_sequence(
+    static const char * parse_sequence(
             parse_state                        & state,
             const char                         * src,
             const std::string                  & rule_name,
@@ -247,7 +247,7 @@ namespace grammar_parser {
         return pos;
     }
 
-    const char * parse_rule(parse_state & state, const char * src) {
+    static const char * parse_rule(parse_state & state, const char * src) {
         const char * name_end = parse_name(src);
         const char * pos      = parse_space(name_end, false);
         size_t       name_len = name_end - src;
@@ -285,7 +285,7 @@ namespace grammar_parser {
         }
     }
 
-    void print_grammar_char(FILE * file, uint32_t c) {
+    static void print_grammar_char(FILE * file, uint32_t c) {
         if (0x20 <= c && c <= 0x7f) {
             fprintf(file, "%c", static_cast<char>(c));
         } else {
@@ -294,7 +294,7 @@ namespace grammar_parser {
         }
     }
 
-    bool is_char_element(llama_grammar_element elem) {
+    static bool is_char_element(llama_grammar_element elem) {
         switch (elem.type) {
             case LLAMA_GRETYPE_CHAR:           return true;
             case LLAMA_GRETYPE_CHAR_NOT:       return true;
@@ -304,7 +304,7 @@ namespace grammar_parser {
         }
     }
 
-    void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
+    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
         for (auto elem : rule) {
             switch (elem.type) {
                 case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
@@ -334,7 +334,7 @@ namespace grammar_parser {
         fprintf(file, "\n");
     }
 
-    void print_rule(
+    static void print_rule(
             FILE     * file,
             uint32_t   rule_id,
             const std::vector<llama_grammar_element> & rule,

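All of the grammar_parser helpers above gain `static` even though they already sit inside a named namespace; a named namespace affects only the qualified name, not the linkage, so the keyword is still required to make them translation-unit-local. An unnamed namespace is the common alternative spelling. A minimal sketch of the two equivalent forms (is_space_char is an invented helper, used here only for illustration):

    namespace grammar_parser {
        // internal linkage via `static`, the style used in this commit:
        static bool is_word_char(char c) {
            return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
        }

        // equivalent internal linkage via an unnamed namespace:
        namespace {
            bool is_space_char(char c) {
                return c == ' ' || c == '\t';
            }
        }
    }

    int main() {
        return grammar_parser::is_word_char('a') && grammar_parser::is_space_char(' ') ? 0 : 1;
    }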
examples/baby-llama/baby-llama.cpp
@@ -9,12 +9,12 @@
 #endif
 
 #ifdef LLAMA_DEFAULT_RMS_EPS
-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 #else
-static const float rms_norm_eps = 5e-6f;
+constexpr float rms_norm_eps = 5e-6f;
 #endif
 
-float frand() {
+static float frand() {
     return (float)rand()/(float)RAND_MAX;
 }
 
@@ -25,19 +25,21 @@ struct random_normal_distribution {
     float max;
 };
 
-void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
+static void init_random_normal_distribution(
+    struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
+) {
     rnd->gen = std::mt19937(seed);
     rnd->nd = std::normal_distribution<float>{mean, std};
     rnd->min = min;
     rnd->max = max;
 }
 
-float frand_normal(struct random_normal_distribution * rnd) {
+static float frand_normal(struct random_normal_distribution * rnd) {
     const float r = rnd->nd(rnd->gen);
     return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
 }
 
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
@@ -48,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
     ggml_graph_compute(graph, &plan);
 }
 
-struct ggml_tensor * randomize_tensor(
-        struct ggml_tensor * tensor,
-        int ndims,
-        const int64_t ne[],
-        float fmin,
-        float fmax) {
-
+static struct ggml_tensor * randomize_tensor(
+    struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
+) {
     switch (ndims) {
         case 1:
             for (int i0 = 0; i0 < ne[0]; i0++) {
@@ -95,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
     return tensor;
 }
 
-struct ggml_tensor * randomize_tensor_normal(
-        struct ggml_tensor * tensor,
-        int ndims,
-        const int64_t ne[],
-        struct random_normal_distribution * rnd) {
+static struct ggml_tensor * randomize_tensor_normal(
+    struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
+) {
     float scale = 1.0; // xavier
     switch (ndims) {
         case 1:
@@ -159,7 +155,7 @@ struct llama_hparams {
     }
 };
 
-uint32_t get_n_ff(const struct llama_hparams* hparams) {
+static uint32_t get_n_ff(const struct llama_hparams* hparams) {
     const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
     return n_ff;
 }
@@ -260,7 +256,7 @@ struct llama_model_lora {
     std::vector<llama_layer_lora> layers;
 };
 
-void init_model(struct llama_model * model) {
+static void init_model(struct llama_model * model) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_embd  = hparams.n_embd;
@@ -297,7 +293,7 @@ void init_model(struct llama_model * model) {
 }
 
 
-void init_model_lora(struct llama_model_lora * model) {
+static void init_model_lora(struct llama_model_lora * model) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_embd  = hparams.n_embd;
@@ -340,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
     }
 }
 
-void set_param_model(struct llama_model * model) {
+static void set_param_model(struct llama_model * model) {
     const auto& hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -366,7 +362,7 @@ void set_param_model(struct llama_model * model) {
     }
 }
 
-void set_param_model_lora(struct llama_model_lora * model) {
+static void set_param_model_lora(struct llama_model_lora * model) {
     const auto& hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -397,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
     }
 }
 
-void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -426,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
 }
 
 
-void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
+static void randomize_model_lora(
+    struct llama_model_lora * model, int seed, float mean, float std, float min, float max
+) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -459,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
     }
 }
 
-bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
+static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_ctx   = hparams.n_ctx;
@@ -495,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
     return true;
 }
 
-bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
+static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_ctx   = hparams.n_ctx;
@@ -531,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
     return true;
 }
 
-struct ggml_tensor * forward(
-        struct llama_model    * model,
-        struct llama_kv_cache * cache,
-        struct ggml_context   * ctx0,
-        struct ggml_cgraph    * gf,
-        struct ggml_tensor    * tokens_input,
-        const  int              n_tokens,
-        const  int              n_past) {
-
+static struct ggml_tensor * forward(
+    struct llama_model    * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context   * ctx0,
+    struct ggml_cgraph    * gf,
+    struct ggml_tensor    * tokens_input,
+    const  int              n_tokens,
+    const  int              n_past
+) {
     const int N = n_tokens;
 
     struct llama_kv_cache& kv_self = *cache;
@@ -756,25 +754,25 @@ struct ggml_tensor * forward(
     return inpL;
 }
 
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
     GGML_ASSERT(tensor->n_dims == 1);
     GGML_ASSERT(tensor->ne[0] == ne0);
 }
 
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
     GGML_ASSERT(tensor->n_dims == 2);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
 }
 
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
     GGML_ASSERT(tensor->n_dims == 3);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
     GGML_ASSERT(tensor->ne[2] == ne2);
 }
 
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
     GGML_ASSERT(tensor->n_dims == 4);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
@@ -782,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
     GGML_ASSERT(tensor->ne[3] == ne3);
 }
 
-struct ggml_tensor * forward_batch(
-        struct llama_model    * model,
-        struct llama_kv_cache * cache,
-        struct ggml_context   * ctx0,
-        struct ggml_cgraph    * gf,
-        struct ggml_tensor    * tokens_input,
-        const  int              n_tokens,
-        const  int              n_past,
-        const  int              n_batch) {
-
+static struct ggml_tensor * forward_batch(
+    struct llama_model    * model,
+    struct llama_kv_cache * cache,
+    struct ggml_context   * ctx0,
+    struct ggml_cgraph    * gf,
+    struct ggml_tensor    * tokens_input,
+    const  int              n_tokens,
+    const  int              n_past,
+    const  int              n_batch
+) {
     const int N = n_tokens;
 
     struct llama_kv_cache& kv_self = *cache;
@@ -1073,16 +1071,15 @@ struct ggml_tensor * forward_batch(
     return inpL;
 }
 
-
-struct ggml_tensor * forward_lora(
-        struct llama_model_lora * model,
-        struct llama_kv_cache   * cache,
-        struct ggml_context     * ctx0,
-        struct ggml_cgraph      * gf,
-        struct ggml_tensor      * tokens_input,
-        const  int                n_tokens,
-        const  int                n_past) {
-
+static struct ggml_tensor * forward_lora(
+    struct llama_model_lora * model,
+    struct llama_kv_cache   * cache,
+    struct ggml_context     * ctx0,
+    struct ggml_cgraph      * gf,
+    struct ggml_tensor      * tokens_input,
+    const  int                n_tokens,
+    const  int                n_past
+) {
     const int N = n_tokens;
 
     struct llama_kv_cache& kv_self = *cache;
@@ -1328,7 +1325,7 @@ struct ggml_tensor * forward_lora(
     return inpL;
 }
 
-void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
     assert(logits->n_dims == 2);
     assert(probs->n_dims == 2);
     assert(best_samples->n_dims == 1);
@@ -1359,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
     }
 }
 
-void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
+static void sample_softmax_batch(
+    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
+    struct ggml_tensor * best_samples
+) {
     GGML_ASSERT(best_samples->n_dims == 2);
     GGML_ASSERT(logits->n_dims == 3);
     GGML_ASSERT(probs->n_dims == 3);
@@ -1393,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
     }
 }
 
-void print_row(struct ggml_tensor * probs, int i) {
+static void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
         printf(" %.2f", p);
@@ -1401,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
     printf("\n");
 }
 
-void print_matrix(struct ggml_tensor * probs) {
+static void print_matrix(struct ggml_tensor * probs) {
     assert(probs->n_dims == 2);
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
@@ -1412,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
     }
 }
 
-void print_token(int token, int n_vocab) {
+static void print_token(int token, int n_vocab) {
     for (int k = 0; k < token; ++k) {
         printf(" ");
     }
@@ -1423,14 +1423,14 @@ void print_token(int token, int n_vocab) {
     printf("\n");
 }
 
-void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
+static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
     for (int i=0; i<tokens->ne[0]; ++i) {
         int token = ggml_get_i32_1d(tokens, i);
         print_token(token, n_vocab);
     }
 }
 
-void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = targets->ne[0];
     float randomness = 0.0f;
@@ -1451,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
     }
 }
 
-void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+static void get_example_targets_batch(
+    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
+) {
     GGML_ASSERT(tokens_input->n_dims == 2);
     GGML_ASSERT(     targets->n_dims == 3);
     int n_tokens = tokens_input->ne[0];
@@ -1474,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
     }
 }
 
-void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
+static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab = targets->ne[0];
     for (int i=0; i<n_tokens-n_shift; ++i) {
@@ -1485,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
     }
 }
 
-struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+static struct ggml_tensor * square_error_loss(
+    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
+) {
     // todo: instead of a-b: a[1:]-b[:-1]
     return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
 }
 
-struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
+static struct ggml_tensor * cross_entropy_loss(
+    struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
+) {
     const float eps = 1e-3f;
     return
         ggml_sum(ctx,

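One change in this file is not about adding `static`: rms_norm_eps goes from `static const float` to `constexpr float`. In C++ a namespace-scope `const` (and therefore `constexpr`) variable already has internal linkage, so the explicit `static` was redundant; `constexpr` additionally guarantees the initializer is a compile-time constant. A minimal sketch:

    // At namespace scope in C++, all three of these have internal linkage:
    static const float a = 5e-6f; // `static` is redundant on a namespace-scope const
    const float        b = 5e-6f; // same linkage as `a`
    constexpr float    c = 5e-6f; // same linkage, initializer checked at compile time

    int main() {
        return (a + b + c) > 0.0f ? 0 : 1;
    }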
examples/beam-search/beam-search.cpp
@@ -30,7 +30,8 @@ struct ostream_beam_view {
     llama_context * ctx;
     llama_beam_view beam_view;
 };
-std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
+
+static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
     os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
     for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
         os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
@@ -46,7 +47,7 @@ struct beam_search_callback_data {
 
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
-bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
+static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
     return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
 }
 
@@ -56,7 +57,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
 //  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
 //    This is also called when the stop condition is met.
 //    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
+static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
     auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
     // Mark beams as EOS as needed.
     for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {

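Note that even an `operator<<` can take `static`: internal linkage does not stop ordinary lookup from finding the overload, as long as every use is in the same translation unit, which is the case for this example program. A compact sketch:

    #include <iostream>

    struct point { int x, y; };

    // internal linkage; still found by overload resolution within this file
    static std::ostream & operator<<(std::ostream & os, const point & p) {
        return os << "(" << p.x << ", " << p.y << ")";
    }

    int main() {
        std::cout << point{1, 2} << "\n";
        return 0;
    }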
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -115,7 +115,7 @@ struct TransformerWeights {
     }
 };
 
-void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
+static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
     w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -158,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     }
 }
 
-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
+static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -189,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
     return 0;
 }
 
-void print_sample_weights(TransformerWeights *w){
+static void print_sample_weights(TransformerWeights *w){
     printf("----- Quick print of first of the weight vales of all the variables\n");
     printf("%f\n", w->token_embedding_table[0]);
     printf("%f\n", w->rms_att_weight[0]);
@@ -324,7 +324,7 @@ struct train_params {
     int mem_compute1_gb;
 };
 
-void print_params(struct my_llama_hparams * params) {
+static void print_params(struct my_llama_hparams * params) {
     printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
     printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
     printf("%s: n_embd:  %d\n", __func__, params->n_embd);
@@ -335,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
     printf("%s: n_rot:   %d\n", __func__, params->n_rot);
 }
 
-void init_model(struct my_llama_model * model) {
+static void init_model(struct my_llama_model * model) {
     const auto & hparams = model->hparams;
 
     const uint32_t n_embd  = hparams.n_embd;
@@ -408,17 +408,17 @@ void init_model(struct my_llama_model * model) {
     }
 }
 
-float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
     float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
     return *ptr;
 }
 
-int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
     int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
     return *ptr;
 }
 
-void print_row(struct ggml_tensor * probs, int i) {
+static void print_row(struct ggml_tensor * probs, int i) {
     for (int k = 0; k < probs->ne[0]; ++k) {
         float p = get_f32_2d(probs, k, i);
         printf(" %f", p);
@@ -426,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
     printf("\n");
 }
 
-void print_matrix(struct ggml_tensor * probs) {
+static void print_matrix(struct ggml_tensor * probs) {
     assert(probs->n_dims == 2);
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
@@ -531,7 +531,7 @@ struct llama_file {
     }
 };
 
-bool is_ggml_file(const char *filename) {
+static bool is_ggml_file(const char * filename) {
     llama_file file(filename, "rb");
     if (file.size < 4) {
         return false;
@@ -540,7 +540,7 @@ bool is_ggml_file(const char *filename) {
     return magic == GGUF_MAGIC;
 }
 
-static std::string llama_escape_whitespaces(const std::string& text) {
+static std::string llama_escape_whitespaces(const std::string & text) {
     std::ostringstream out;
     for (char c : text) {
         if (c == ' ') out << "\xe2\x96\x81";
@@ -549,7 +549,7 @@ static std::string llama_escape_whitespaces(const std::string& text) {
     return out.str();
 }
 
-void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
     if (is_ggml_file(filename)) {
         struct ggml_context * ctx_data = NULL;
 
@@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
     }
 }
 
-void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
+static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
     int ct;
     switch (gg_weights->n_dims){
         case 1:
@@ -673,7 +673,7 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
     }
 }
 
-void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+static void save_as_llama_model(
+    struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
+) {
     // convert AK weights into GG weights one by one.
     // w->token_embedding_table -> model->tok_embeddings
     // float*                   -> struct ggml_tensor
@@ -785,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     gguf_free(ctx);
 }
 
-struct train_params get_default_train_params() {
+static struct train_params get_default_train_params() {
     struct train_params params;
     params.fn_vocab_model    = "models/7B/ggml-model-f16.gguf";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
@@ -835,7 +837,7 @@ struct train_params get_default_train_params() {
     return params;
 }
 
-void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -846,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
 }
 
-bool params_parse(int argc, char ** argv, struct train_params * params) {
+static bool params_parse(int argc, char ** argv, struct train_params * params) {
     bool invalid_param = false;
     bool reqd_param_found = false;
     std::string arg;
@@ -901,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
     return true;
 }
 
-std::string basename(const std::string &path) {
+static std::string basename(const std::string &path) {
     size_t pos = path.find_last_of("/\\");
     if (pos == std::string::npos) {
         return path;

examples/gguf/gguf.cpp
@@ -13,14 +13,14 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-template<typename T>
+template <typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
     ss << val;
     return ss.str();
 }
 
-bool gguf_ex_write(const std::string & fname) {
+static bool gguf_ex_write(const std::string & fname) {
     struct gguf_context * ctx = gguf_init_empty();
 
     gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
@@ -85,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
 }
 
 // just read tensor info
-bool gguf_ex_read_0(const std::string & fname) {
+static bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_init_params params = {
         /*.no_alloc = */ false,
         /*.ctx      = */ NULL,
@@ -143,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
 }
 
 // read and create ggml_context containing the tensors and their data
-bool gguf_ex_read_1(const std::string & fname) {
+static bool gguf_ex_read_1(const std::string & fname) {
     struct ggml_context * ctx_data = NULL;
 
     struct gguf_init_params params = {

examples/main/main.cpp
@@ -41,7 +41,8 @@ static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 
-void write_logfile(
+
+static void write_logfile(
     const llama_context * ctx, const gpt_params & params, const llama_model * model,
     const std::vector<llama_token> & input_tokens, const std::string & output,
     const std::vector<llama_token> & output_tokens
@@ -86,7 +87,7 @@ void write_logfile(
 }
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-void sigint_handler(int signo) {
+static void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting) {
             is_interacting = true;

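Marking sigint_handler static works because a signal handler is only ever referenced by address at registration time; internal linkage does not affect that. A minimal sketch of the pattern, assuming nothing about the surrounding main.cpp:

    #include <csignal>
    #include <cstdio>

    static volatile std::sig_atomic_t g_interrupted = 0;

    static void sigint_handler(int signo) {
        if (signo == SIGINT) {
            g_interrupted = 1; // only async-signal-safe work belongs here
        }
    }

    int main() {
        std::signal(SIGINT, sigint_handler); // only the address leaves this file
        std::puts("press Ctrl-C, then Enter, to exit with status 1");
        std::getchar();
        return g_interrupted;
    }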
examples/perplexity/perplexity.cpp
@@ -28,9 +28,10 @@ struct results_log_softmax {
     float  prob;
 };
 
-void write_logfile(const llama_context * ctx, const gpt_params & params,
-                   const llama_model * model, const struct results_perplexity & results) {
-
+static void write_logfile(
+    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const struct results_perplexity & results
+) {
     if (params.logdir.empty()) {
         return;
     }
@@ -76,7 +77,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
     fclose(logfile);
 }
 
-std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
     for (float v : logits) max_logit = std::max(max_logit, v);
@@ -92,7 +93,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
     return probs;
 }
 
-results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
+static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
     float max_logit = logits[0];
     for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
     double sum_exp = 0.0;
@@ -100,9 +101,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
     return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }
 
-void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-        double & nll, double & nll2, float * logit_history, float * prob_history) {
-
+static void process_logits(
+    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
+    double & nll, double & nll2, float * logit_history, float * prob_history
+) {
     std::mutex mutex;
     int counter = 0;
     auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -130,7 +132,7 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
 
 }
 
-results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
+static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
@@ -260,8 +262,7 @@ results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params)
     return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
 
-results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
-
+static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
     if (params.ppl_stride > 0) {
         return perplexity_v2(ctx, params);
     }
@@ -400,8 +401,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
     return {tokens, ppl, logit_history, prob_history};
 }
 
-std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
-        int n_vocab, int n_thread) {
+static std::vector<float> hellaswag_evaluate_tokens(
+    llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
+) {
     std::vector<float> result;
     result.reserve(tokens.size() * n_vocab);
     size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
@@ -421,7 +423,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
     return result;
 }
 
-void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl

examples/quantize-stats/quantize-stats.cpp
@@ -34,8 +34,8 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
-const size_t HISTOGRAM_BUCKETS = 150;
-const double HISTOGRAM_RANGE = 0.03;
+constexpr size_t HISTOGRAM_BUCKETS = 150;
+constexpr double HISTOGRAM_RANGE = 0.03;
 
 struct error_stats {
     size_t num_samples;
@@ -44,8 +44,7 @@ struct error_stats {
     uint64_t error_histogram[HISTOGRAM_BUCKETS];
 };
 
-
-void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
     quantize_stats_params params;
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
@@ -71,7 +70,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
 }
 
 // Check if a layer is included/excluded by command line
-bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
     for (const auto& excluded : params.exclude_layers) {
         if (std::regex_search(layer, std::regex(excluded))) {
             return false;
@@ -86,7 +85,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
 }
 
 // Update error statistics given vectors with the before/after result of quantization
-void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
     for (int64_t i = 0; i < nelements; i++) {
         double diff = input[i] - output[i];
         stats.total_error += diff * diff;
@@ -96,14 +95,14 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
     stats.num_samples += nelements;
 }
 
-void combine_error_stats(error_stats & into, const error_stats & from) {
+static void combine_error_stats(error_stats & into, const error_stats & from) {
     into.num_samples += from.num_samples;
     into.total_error += from.total_error;
     if (from.max_error > into.max_error) into.max_error = from.max_error;
     for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
 }
 
-double find_quantile(const error_stats & stats, double quantile) {
+static double find_quantile(const error_stats & stats, double quantile) {
     double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
 
     double accum = 0;
@@ -116,7 +115,7 @@ double find_quantile(const error_stats & stats, double quantile) {
     return INFINITY;
 }
 
-void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
     double rmse = sqrt(stats.total_error / (double) stats.num_samples);
     double median = find_quantile(stats, .5);
     double pct95 = find_quantile(stats, .95);
@@ -143,17 +142,10 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-void test_roundtrip_on_chunk(
-        const ggml_tensor * layer,
-        int64_t offset,
-        int64_t chunk_size,
-        const ggml_type_traits_t & qfns,
-        bool use_reference,
-        float * input_scratch,
-        char * quantized_scratch,
-        float * output_scratch,
-        error_stats & stats) {
-
+static void test_roundtrip_on_chunk(
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+    float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
+) {
     if (layer->type == GGML_TYPE_F16) {
         for (int i = 0; i < chunk_size; i++) {
             input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
@@ -174,18 +166,11 @@ void test_roundtrip_on_chunk(
 
 
 // Run quantization function for a single layer and update error stats
-void test_roundtrip_on_layer(
-        std::string & name,
-        bool print_layer_stats,
-        const ggml_type_traits_t & qfns,
-        bool use_reference,
-        const ggml_tensor * layer,
-        std::vector<float> & input_scratch,
-        std::vector<char> & quantized_scratch,
-        std::vector<float> & output_scratch,
-        error_stats & total_error,
-        int max_thread = 0) {
-
+static void test_roundtrip_on_layer(
+    std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+    const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
+    std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
+) {
     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
     uint64_t nelements = ggml_nelements(layer);

examples/quantize/quantize.cpp
@@ -40,7 +40,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 };
 
 
-bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
 
     for (auto ch : ftype_str_in) {
@@ -72,7 +72,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 // usage:
 //  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
-void usage(const char * executable) {
+static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");

examples/server/server.cpp
@@ -1083,8 +1083,9 @@ static json format_final_response(llama_server_context &llama, const std::string
     return res;
 }
 
-static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
-{
+static json format_partial_response(
+    llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
+) {
     json res = json{
         {"content", content},
         {"stop", false},
@@ -1215,7 +1216,7 @@ static void log_server_request(const Request &req, const Response &res)
                            });
 }
 
-bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
+static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
     return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
 }
 
@@ -1225,7 +1226,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
 //  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
 //    This is also called when the stop condition is met.
 //    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
+static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
     auto & llama = *static_cast<llama_server_context*>(callback_data);
     // Mark beams as EOS as needed.
     for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@@ -1258,7 +1259,8 @@ struct token_translator {
     std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
 };
 
-void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
+static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
+{
     auto & gtps = llama.generated_token_probs;
     auto translator = token_translator{llama.ctx};
     auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };

llama.cpp (19 changed lines)
							| @@ -1,3 +1,4 @@ | ||||
| #define LLAMA_API_INTERNAL | ||||
| #include "llama.h" | ||||
|  | ||||
| #include "ggml.h" | ||||
| @@ -108,7 +109,7 @@ static size_t utf8_len(char src) { | ||||
|     return lookup[highbits]; | ||||
| } | ||||
|  | ||||
| void replace_all(std::string & s, const std::string & search, const std::string & replace) { | ||||
| static void replace_all(std::string & s, const std::string & search, const std::string & replace) { | ||||
|     std::string result; | ||||
|     for (size_t pos = 0; ; pos += search.length()) { | ||||
|         auto new_pos = s.find(search, pos); | ||||
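The loop above is easy to misread, so a quick usage note (values are illustrative): it walks the source string with find and copies the unmatched gaps into a separate result, which means replacements are never re-scanned even when replace contains search:

    std::string s = "a.b.c";
    replace_all(s, ".", "::");   // s is now "a::b::c"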
| @@ -1589,7 +1590,7 @@ struct llama_model_loader { | ||||
| // load LLaMA models | ||||
| // | ||||
|  | ||||
| std::string llama_model_ftype_name(enum llama_ftype ftype) { | ||||
| static std::string llama_model_ftype_name(enum llama_ftype ftype) { | ||||
|     if (ftype & LLAMA_FTYPE_GUESSED) { | ||||
|         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)"; | ||||
|     } | ||||
| @@ -4295,7 +4296,7 @@ struct llama_grammar_candidate { | ||||
|  | ||||
| // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as | ||||
| // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. | ||||
| std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8( | ||||
| static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8( | ||||
|         const char         * src, | ||||
|         llama_partial_utf8   partial_start) { | ||||
|     static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; | ||||
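Because the decoder is resumable, a caller can feed it a byte stream in arbitrary chunks and thread llama_partial_utf8 through successive calls. A minimal sketch (the input bytes are made up; the struct is zero-initialized to mean "no pending sequence"):

    // First chunk ends mid-sequence: 0xC3 opens a 2-byte code point.
    auto r1 = decode_utf8("caf\xC3", { 0, 0 });
    // The returned state carries the pending bytes into the next call.
    auto r2 = decode_utf8("\xA9", r1.second);   // completes U+00E9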
| @@ -5893,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s | ||||
| } | ||||
|  | ||||
| // TODO: after the GGUF PR, this likely won't work and needs to be updated | ||||
| int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { | ||||
| static int llama_apply_lora_from_file_internal( | ||||
|     const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads | ||||
| ) { | ||||
|     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); | ||||
|  | ||||
|     const int64_t t_start_lora_us = ggml_time_us(); | ||||
| @@ -6440,7 +6443,7 @@ struct llama_context * llama_new_context_with_model( | ||||
|     return ctx; | ||||
| } | ||||
|  | ||||
| struct llama_context * llama_init_from_file( | ||||
| static struct llama_context * llama_init_from_file( | ||||
|                              const char * path_model, | ||||
|             struct llama_context_params   params) { | ||||
|     struct llama_model * model = llama_load_model_from_file(path_model, params); | ||||
| @@ -6645,7 +6648,7 @@ struct llama_data_file_context : llama_data_context { | ||||
|  * llama_copy_state_data(ctx, &data_ctx); | ||||
|  * | ||||
| */ | ||||
| void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { | ||||
| static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { | ||||
|     // copy rng | ||||
|     { | ||||
|         std::stringstream rng_ss; | ||||
| @@ -7183,7 +7186,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { | ||||
| } | ||||
|  | ||||
| // For internal test use | ||||
| const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) { | ||||
| const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map( | ||||
|     struct llama_context * ctx | ||||
| ) { | ||||
|     return ctx->model.tensors_by_name; | ||||
| } | ||||
|  | ||||
|   | ||||
llama.h | 4
							| @@ -540,7 +540,9 @@ extern "C" { | ||||
|  | ||||
| struct ggml_tensor; | ||||
|  | ||||
| const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx); | ||||
| const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map( | ||||
|     struct llama_context * ctx | ||||
| ); | ||||
|  | ||||
| #endif // LLAMA_API_INTERNAL | ||||
|  | ||||
|   | ||||
| @@ -16,7 +16,7 @@ | ||||
|  | ||||
| constexpr int kVecSize = 1 << 18; | ||||
|  | ||||
| float drawFromGaussianPdf(std::mt19937& rndm) { | ||||
| static float drawFromGaussianPdf(std::mt19937& rndm) { | ||||
|     constexpr double kScale = 1./(1. + std::mt19937::max()); | ||||
|     constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale; | ||||
|     static float lastX; | ||||
| @@ -28,7 +28,8 @@ float drawFromGaussianPdf(std::mt19937& rndm) { | ||||
|     haveX = true; | ||||
|     return r*cos(phi); | ||||
| } | ||||
| void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) { | ||||
|  | ||||
| static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) { | ||||
|     for (auto& v : values) v = mean + drawFromGaussianPdf(rndm); | ||||
| } | ||||
|  | ||||
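drawFromGaussianPdf is the classic trigonometric Box-Muller transform: two uniform draws become a radius and an angle, r*cos(phi) is returned, and r*sin(phi) is cached for the next call. A throwaway sanity check (not part of the patch) could look like:

    std::mt19937 rng(42);
    double sum = 0.0, sum2 = 0.0;
    const int N = 100000;
    for (int i = 0; i < N; ++i) {
        const double x = drawFromGaussianPdf(rng);
        sum += x; sum2 += x*x;
    }
    // A unit Gaussian should give mean near 0 and variance near 1.
    printf("mean=%.3f var=%.3f\n", sum/N, sum2/N - (sum/N)*(sum/N));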
|   | ||||
| @@ -36,15 +36,15 @@ | ||||
| #define GGML_PRINT(...) printf(__VA_ARGS__) | ||||
|  | ||||
|  | ||||
| float frand(void) { | ||||
| static float frand(void) { | ||||
|     return (float)rand()/(float)RAND_MAX; | ||||
| } | ||||
|  | ||||
| int irand(int n) { | ||||
| static int irand(int n) { | ||||
|     return rand()%n; | ||||
| } | ||||
|  | ||||
| void get_random_dims(int64_t * dims, int ndims) { | ||||
| static void get_random_dims(int64_t * dims, int ndims) { | ||||
|     dims[0] = dims[1] = dims[2] = dims[3] = 1; | ||||
|  | ||||
|     for (int i = 0; i < ndims; i++) { | ||||
| @@ -52,7 +52,7 @@ void get_random_dims(int64_t * dims, int ndims) { | ||||
|     } | ||||
| } | ||||
|  | ||||
| void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { | ||||
| static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { | ||||
|     dims[0] = dims[1] = dims[2] = dims[3] = 1; | ||||
|  | ||||
|     for (int i = 0; i < ndims; i++) { | ||||
| @@ -61,12 +61,9 @@ void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) { | ||||
| } | ||||
|  | ||||
|  | ||||
| struct ggml_tensor * get_random_tensor( | ||||
|         struct ggml_context * ctx0, | ||||
|         int ndims, | ||||
|         int64_t ne[], | ||||
|         float fmin, | ||||
|         float fmax) { | ||||
| static struct ggml_tensor * get_random_tensor( | ||||
|     struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax | ||||
| ) { | ||||
|     struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); | ||||
|  | ||||
|     switch (ndims) { | ||||
| @@ -109,11 +106,11 @@ struct ggml_tensor * get_random_tensor( | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| float get_element(const struct ggml_tensor * t, int idx) { | ||||
| static float get_element(const struct ggml_tensor * t, int idx) { | ||||
|     return ((float *)t->data)[idx]; | ||||
| } | ||||
|  | ||||
| void set_element(struct ggml_tensor * t, int idx, float value) { | ||||
| static void set_element(struct ggml_tensor * t, int idx, float value) { | ||||
|     ((float *)t->data)[idx] = value; | ||||
| } | ||||
|  | ||||
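Taken together, these helpers give the tests a compact way to build randomized inputs. An illustrative call sequence (the surrounding ggml_context setup, ctx0, is assumed to exist elsewhere in the file):

    int64_t ne[4];
    get_random_dims_minmax(ne, 2, 1, 8);    // random 2-D shape with dims in [1, 8]
    struct ggml_tensor * t = get_random_tensor(ctx0, 2, ne, -1.0f, 1.0f);
    set_element(t, 0, 0.5f);                // poke one value...
    const float v = get_element(t, 0);      // ...and read it back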
|   | ||||
| @@ -13,24 +13,24 @@ | ||||
| #pragma warning(disable: 4244 4267) // possible loss of data | ||||
| #endif | ||||
|  | ||||
| const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; | ||||
| const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; | ||||
| const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; | ||||
| const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; | ||||
| const float MAX_DOT_PRODUCT_ERROR = 0.02f; | ||||
| constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; | ||||
| constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; | ||||
| constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; | ||||
| constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; | ||||
| constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; | ||||
|  | ||||
| const char* RESULT_STR[] = {"ok", "FAILED"}; | ||||
| static const char* RESULT_STR[] = {"ok", "FAILED"}; | ||||
|  | ||||
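One detail worth noting: the float constants only change from const to constexpr for style, since a namespace-scope const object already has internal linkage in C++. RESULT_STR is different: it is an array of pointers to const char, and the array object itself is modifiable, so it needs static to get internal linkage. A minimal illustration:

    const float  f   = 1.0f;           // internal linkage already
    const char * a[] = {"x"};          // array is non-const: external linkage
    static const char * b[] = {"x"};   // static forces internal linkage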
|  | ||||
| // Generate synthetic data | ||||
| void generate_data(float offset, size_t n, float * dst) { | ||||
| static void generate_data(float offset, size_t n, float * dst) { | ||||
|     for (size_t i = 0; i < n; i++) { | ||||
|         dst[i] = 0.1 + 2*cosf(i + offset); | ||||
|     } | ||||
| } | ||||
|  | ||||
| // Calculate RMSE between two float arrays | ||||
| float array_rmse(const float * a1, const float * a2, size_t n) { | ||||
| static float array_rmse(const float * a1, const float * a2, size_t n) { | ||||
|     double sum = 0; | ||||
|     for (size_t i = 0; i < n; i++) { | ||||
|         double diff = a1[i] - a2[i]; | ||||
| @@ -40,7 +40,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) { | ||||
| } | ||||
|  | ||||
| // Total quantization error on test data | ||||
| float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { | ||||
| static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { | ||||
|     std::vector<uint8_t> tmp_q(2*test_size); | ||||
|     std::vector<float> tmp_out(test_size); | ||||
|  | ||||
| @@ -50,7 +50,7 @@ float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, cons | ||||
| } | ||||
|  | ||||
| // Total quantization error on test data | ||||
| float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { | ||||
| static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { | ||||
|     std::vector<uint8_t> tmp_q(2*test_size); | ||||
|     std::vector<float> tmp_out(test_size); | ||||
|     std::vector<float> tmp_out_ref(test_size); | ||||
| @@ -64,7 +64,7 @@ float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, | ||||
|     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); | ||||
| } | ||||
|  | ||||
| float dot_product(const float * a1, const float * a2, size_t test_size) { | ||||
| static float dot_product(const float * a1, const float * a2, size_t test_size) { | ||||
|     double sum = 0; | ||||
|     for (size_t i = 0; i < test_size; i++) { | ||||
|         sum += a1[i] * a2[i]; | ||||
| @@ -73,7 +73,9 @@ float dot_product(const float * a1, const float * a2, size_t test_size) { | ||||
| } | ||||
|  | ||||
| // Total dot product error | ||||
| float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) { | ||||
| static float dot_product_error( | ||||
|     ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2 | ||||
| ) { | ||||
|     std::vector<uint8_t> tmp_q1(2*test_size); | ||||
|     std::vector<uint8_t> tmp_q2(2*test_size); | ||||
|  | ||||
|   | ||||
| @@ -61,22 +61,22 @@ inline int64_t cpu_cycles() { | ||||
|  | ||||
|  | ||||
| // Generate synthetic data | ||||
| void generate_data(float offset, size_t n, float * dst) { | ||||
| static void generate_data(float offset, size_t n, float * dst) { | ||||
|     for (size_t i = 0; i < n; i++) { | ||||
|         dst[i] = 0.1 + 2*cosf(i + offset); | ||||
|     } | ||||
| } | ||||
|  | ||||
| float gigabytes_per_second(size_t bytes, int64_t usecs) { | ||||
| static float gigabytes_per_second(size_t bytes, int64_t usecs) { | ||||
|     return bytes / (float) usecs * 1000000 / (1024*1024*1024); | ||||
| } | ||||
|  | ||||
| void * align_with_offset(void * ptr, int offset) { | ||||
| static void * align_with_offset(void * ptr, int offset) { | ||||
|     size_t dummy_size = MAX_ALIGNMENT * 4; | ||||
|     return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; | ||||
| } | ||||
|  | ||||
| void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) { | ||||
| static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) { | ||||
|     int64_t min_time_us = INT64_MAX; | ||||
|     int64_t total_time_us = 0; | ||||
|     int64_t min_time_cycles = INT64_MAX; | ||||
| @@ -108,7 +108,7 @@ void benchmark_function(size_t size, size_t q_size, int64_t iterations, const st | ||||
|     printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * iterations, total_time_us)); | ||||
| } | ||||
|  | ||||
| void usage(char * argv[]) { | ||||
| static void usage(char * argv[]) { | ||||
|     printf("Benchmark quantization specific functions on synthetic data\n"); | ||||
|     printf("\n"); | ||||
|     printf("usage: %s [options]\n", argv[0]); | ||||
|   | ||||
| @@ -12,7 +12,8 @@ | ||||
| #include <vector> | ||||
| #include <algorithm> | ||||
|  | ||||
| void dump(const llama_token_data_array * candidates) { | ||||
|  | ||||
| static void dump(const llama_token_data_array * candidates) { | ||||
|     for (size_t i = 0; i < candidates->size; i++) { | ||||
|         printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); | ||||
|     } | ||||
| @@ -21,9 +22,7 @@ void dump(const llama_token_data_array * candidates) { | ||||
| #define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) | ||||
|  | ||||
|  | ||||
| void test_top_k(const std::vector<float> & probs, | ||||
|                 const std::vector<float> & expected_probs, | ||||
|                 int k) { | ||||
| static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) { | ||||
|     size_t n_vocab = probs.size(); | ||||
|     std::vector<llama_token_data> candidates; | ||||
|     candidates.reserve(n_vocab); | ||||
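Each test_* helper follows the same pattern: wrap the raw probabilities in a llama_token_data_array, run a single sampler over it, and compare the survivors against expected_probs. Typical invocations (illustrative numbers, in the spirit of the calls in main()):

    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);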
| @@ -45,10 +44,7 @@ void test_top_k(const std::vector<float> & probs, | ||||
| } | ||||
|  | ||||
|  | ||||
| void test_top_p(const std::vector<float> & probs, | ||||
|                 const std::vector<float> & expected_probs, | ||||
|                 float p) { | ||||
|  | ||||
| static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { | ||||
|     size_t n_vocab = probs.size(); | ||||
|     std::vector<llama_token_data> candidates; | ||||
|     candidates.reserve(n_vocab); | ||||
| @@ -70,9 +66,7 @@ void test_top_p(const std::vector<float> & probs, | ||||
| } | ||||
|  | ||||
|  | ||||
| void test_tfs(const std::vector<float> & probs, | ||||
|                 const std::vector<float> & expected_probs, | ||||
|                 float z) { | ||||
| static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) { | ||||
|     size_t n_vocab = probs.size(); | ||||
|     std::vector<llama_token_data> candidates; | ||||
|     candidates.reserve(n_vocab); | ||||
| @@ -93,9 +87,7 @@ void test_tfs(const std::vector<float> & probs, | ||||
| } | ||||
|  | ||||
|  | ||||
| void test_typical(const std::vector<float> & probs, | ||||
|                 const std::vector<float> & expected_probs, | ||||
|                 float p) { | ||||
| static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { | ||||
|     size_t n_vocab = probs.size(); | ||||
|     std::vector<llama_token_data> candidates; | ||||
|     candidates.reserve(n_vocab); | ||||
| @@ -116,11 +108,10 @@ void test_typical(const std::vector<float> & probs, | ||||
| } | ||||
|  | ||||
|  | ||||
| void test_repetition_penalty( | ||||
|                 const std::vector<float> & probs, | ||||
|                 const std::vector<llama_token> & last_tokens, | ||||
|                 const std::vector<float> & expected_probs, | ||||
|                 float penalty) { | ||||
| static void test_repetition_penalty( | ||||
|     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens, | ||||
|     const std::vector<float> & expected_probs, float penalty | ||||
| ) { | ||||
|     assert(probs.size() == expected_probs.size()); | ||||
|  | ||||
|     size_t n_vocab = probs.size(); | ||||
| @@ -145,11 +136,10 @@ void test_repetition_penalty( | ||||
| } | ||||
|  | ||||
|  | ||||
| void test_frequency_presence_penalty( | ||||
|                 const std::vector<float> & probs, | ||||
|                 const std::vector<llama_token> & last_tokens, | ||||
|                 const std::vector<float> & expected_probs, | ||||
|                 float alpha_frequency, float alpha_presence) { | ||||
| static void test_frequency_presence_penalty( | ||||
|     const std::vector<float> & probs, const std::vector<llama_token> & last_tokens, | ||||
|     const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence | ||||
| ) { | ||||
|     assert(probs.size() == expected_probs.size()); | ||||
|  | ||||
|     size_t n_vocab = probs.size(); | ||||
|   | ||||
| @@ -13,7 +13,7 @@ | ||||
|  | ||||
| typedef int codepoint; | ||||
|  | ||||
| std::string codepoint_to_utf8(codepoint cp) { | ||||
| static std::string codepoint_to_utf8(codepoint cp) { | ||||
|     std::string result; | ||||
|     if (0x00 <= cp && cp <= 0x7f) { | ||||
|         result.push_back(cp); | ||||
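The function continues with the usual 2-, 3-, and 4-byte UTF-8 ranges. A hypothetical round trip makes the intent concrete:

    const std::string s = codepoint_to_utf8(0x00E9);   // "é" encodes as 0xC3 0xA9
    assert(s.size() == 2);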
|   | ||||
Cebtenzzre