mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-29 08:41:22 +00:00 
			
		
		
		
	 3198405e98
			
		
	
	3198405e98
	
	
	
		
			
			* move string_find_partial_stop & string_ends_with to common * add common_regex (supports partial matches) Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/regex-partial.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/regex-partial.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/regex-partial.h Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * partial regex: add missing iterator end checks * string utils: use string_views * direct throw to avoid ggml.h include * regex-partial: replace missed ggml_asserts --------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
		
			
				
	
	
		
			289 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			289 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| //  Tests common_regex (esp. its partial final matches support).
 | |
| 
 | |
| #include "common.h"
 | |
| #include "regex-partial.h"
 | |
| 
 | |
| #include <sstream>
 | |
| #include <iostream>
 | |
| #include <optional>
 | |
| 
 | |
// Compares `expected` against `actual` using operator!=.
// On a mismatch, prints both values to stderr and throws std::runtime_error("Test failed").
// Returns normally when the two values do not compare unequal.
template <class T> static void assert_equals(const T & expected, const T & actual) {
    if (!(expected != actual)) {
        return;
    }
    std::cerr << "Expected: " << expected << '\n'
              << "  Actual: " << actual << '\n'
              << std::flush;
    throw std::runtime_error("Test failed");
}
 | |
| 
 | |
| struct test_case {
 | |
|     std::string pattern;
 | |
|     struct input_output {
 | |
|         std::string input;
 | |
|         common_regex_match output;
 | |
|     };
 | |
|     std::vector<input_output> inputs_outputs;
 | |
| };
 | |
| 
 | |
| static std::string common_regex_match_type_name(common_regex_match_type type) {
 | |
|     switch (type) {
 | |
|         case COMMON_REGEX_MATCH_TYPE_NONE:
 | |
|             return "COMMON_REGEX_MATCH_TYPE_NONE";
 | |
|         case COMMON_REGEX_MATCH_TYPE_PARTIAL:
 | |
|             return "COMMON_REGEX_MATCH_TYPE_PARTIAL";
 | |
|         case COMMON_REGEX_MATCH_TYPE_FULL:
 | |
|             return "COMMON_REGEX_MATCH_TYPE_FULL";
 | |
|     }
 | |
|     return "?";
 | |
| }
 | |
| 
 | |
| static void test_regex() {
 | |
|     printf("[%s]\n", __func__);
 | |
|     auto test = [](const test_case & test_case) {
 | |
|         common_regex cr(test_case.pattern);
 | |
|         std::cout << "Testing pattern: /" << test_case.pattern << "/\n";
 | |
|         // std::cout << "    partial rev: " << cr.reversed_partial_pattern.str() << '\n';
 | |
|         for (const auto & input_output : test_case.inputs_outputs) {
 | |
|             std::cout << "  Input: " << input_output.input << '\n';
 | |
|             auto m = cr.search(input_output.input, 0);
 | |
|             if (m != input_output.output) {
 | |
|                 auto match_to_str = [&](const std::optional<common_regex_match> & m) {
 | |
|                     std::ostringstream ss;
 | |
|                     if (m->type == COMMON_REGEX_MATCH_TYPE_NONE) {
 | |
|                         ss << "<no match>";
 | |
|                     } else {
 | |
|                         GGML_ASSERT(!input_output.output.groups.empty());
 | |
|                         std::vector<std::string> parts;
 | |
|                         for (const auto & g : m->groups) {
 | |
|                             parts.push_back("{" + std::to_string(g.begin) + ", " + std::to_string(g.end) + "}");
 | |
|                         }
 | |
|                         ss << "{" << common_regex_match_type_name(m->type) << ", {" << string_join(parts, ", ") << "}}";
 | |
|                     }
 | |
|                     return ss.str();
 | |
|                 };
 | |
|                 std::cout << "    Expected: " << match_to_str(input_output.output) << '\n';
 | |
|                 std::cout << "         Got: " << match_to_str(m) << '\n';
 | |
|                 std::cout << " Inverted pattern: /" << regex_to_reversed_partial_regex(test_case.pattern) << "/\n";
 | |
| 
 | |
|                 throw std::runtime_error("Test failed");
 | |
|             }
 | |
|         }
 | |
|     };
 | |
|     test({
 | |
|         "a",
 | |
|         {
 | |
|             {"a", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
 | |
|             {"b", {COMMON_REGEX_MATCH_TYPE_NONE, {}}},
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
 | |
|             {"ba", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 2}}}},
 | |
|         }
 | |
|     });
 | |
|     test({
 | |
|         "abcd",
 | |
|         {
 | |
|             {"abcd", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
 | |
|             {"abcde", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
 | |
|             {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | |
|             {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | |
|             {"d", {}},
 | |
|             {"bcd", {}},
 | |
|             {"cde", {}},
 | |
|             {"cd", {}},
 | |
|             {"yeah ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{5, 7}}}},
 | |
|             {"abbie", {}},
 | |
|             {"", {}},
 | |
|         }
 | |
|     });
 | |
|     test({
 | |
|         ".*?ab",
 | |
|         {
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | |
|             {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | |
|             {"dab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | |
|             {"dabc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | |
|             {"da", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | |
|             {"d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | |
|         }
 | |
|     });
 | |
|     test({
 | |
|         "a.*?b",
 | |
|         {
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | |
|             {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | |
|             {"a b", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | |
|             {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | |
|             {"argh", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
 | |
|             {"d", {}},
 | |
|             {"b", {}},
 | |
|         }
 | |
|     });
 | |
|     test({
 | |
|         "ab(?:cd){2,4}ef",
 | |
|         {
 | |
|             // {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, 0, {}}},
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | |
|             {"abcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
 | |
|             {"abcde", {}},
 | |
|             {"abcdef", {}},
 | |
|             {"abcdcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | |
|             {"abcdcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 7}}}},
 | |
|             {"abcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
 | |
|             {"abcdcdcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 12}}}},
 | |
|             {"abcdcdcdcdcdef", {}},
 | |
|             {"abcde", {}},
 | |
|             {"yea", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{2, 3}}}},
 | |
|         }
 | |
|     });
 | |
|     test({
 | |
|         "a(?:rte| pure )fact",
 | |
|         {
 | |
|             {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | |
|             {"art", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | |
|             {"artefa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | |
|             {"fact", {}},
 | |
|             {"an arte", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{3, 7}}}},
 | |
|             {"artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
 | |
|             {"an artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{3, 11}}}},
 | |
|             {"a pure", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | |
|             {"a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 11}}}},
 | |
|             {"it's a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{5, 16}}}},
 | |
|             {"" , {}},
 | |
|             {"pure", {}},
 | |
|             {"pure fact", {}},
 | |
|         }
 | |
|     });
 | |
|     test({
 | |
|         "abc",
 | |
|         {
 | |
|             {" abcc", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 4}}}},
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | |
|             {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | |
|             {" ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{1, 3}}}},
 | |
|             {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | |
|             {"b", {}},
 | |
|             {"c", {}},
 | |
|             {"", {}},
 | |
|         }
 | |
|     });
 | |
| 
 | |
|     test({
 | |
|         "(?:abc)?\\s*def",
 | |
|         {
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | |
|             {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | |
|             {"abc ", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
 | |
|             {"abc d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
 | |
|             {"abc de", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | |
|             {"abc def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
 | |
|             {"abc defg", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
 | |
|             {"abc defgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
 | |
|             {"abcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
 | |
|             {"abcdefgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 6}}}},
 | |
|             {" d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | |
|             {"def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | |
|         }
 | |
|     });
 | |
| 
 | |
|     test({
 | |
|         "a+b",
 | |
|         {
 | |
|             {"aaab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
 | |
|             {"aaa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | |
|             {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | |
|         }
 | |
|     });
 | |
| 
 | |
|     test({
 | |
|         "(?:"
 | |
|             "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
 | |
|             "("                          // match 2 (open_tag)
 | |
|                 "<tool_call>"
 | |
|                 "|<function_call>"
 | |
|                 "|<tool>"
 | |
|                 "|<tools>"
 | |
|                 "|<response>"
 | |
|                 "|<json>"
 | |
|                 "|<xml>"
 | |
|                 "|<JSON>"
 | |
|             ")?"
 | |
|             "(\\s*\\{\\s*\"name\"\\s*:)" // match 3 (named tool call)
 | |
|         ")"
 | |
|         "|<function=([^>]+)>"            // match 4 (function name)
 | |
|         "|<function name=\"([^\"]+)\">", // match 5 (function name again)
 | |
|         {
 | |
|             {"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}, {54, 54}, {54, 54}, {0, 8}, {54, 54}, {54, 54}}}},
 | |
|             {"<tool_call> {\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 18}}}},
 | |
|             {"<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 17}}}},
 | |
|             {"Let's call something\n<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{21, 38}}}},
 | |
|             {"Ok then<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 24}}}},
 | |
|             {"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | |
|             {"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 13}}}},
 | |
|             {"<tool_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 20}, {66, 66}, {0, 11}, {11, 20}, {66, 66}, {66, 66}}}},
 | |
|             {"<function_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 24}, {70, 70}, {0, 15}, {15, 24}, {70, 70}, {70, 70}}}},
 | |
|             {"<function name=\"special_function\"> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 34}, {89, 89}, {89, 89}, {89, 89}, {89, 89}, {16, 32}}}},
 | |
|             {"<function=all>", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 14}, {14, 14}, {14, 14}, {14, 14}, {10, 13}, {14, 14}}}},
 | |
| 
 | |
|         }
 | |
|     });
 | |
| }
 | |
| 
 | |
| static void test_regex_to_reversed_partial_regex() {
 | |
|     printf("[%s]\n", __func__);
 | |
| 
 | |
|     assert_equals<std::string>(
 | |
|         "((?:(?:c)?b)?a)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("abc"));
 | |
| 
 | |
|     assert_equals<std::string>(
 | |
|         "(a+)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("a+"));
 | |
| 
 | |
|     assert_equals<std::string>(
 | |
|         "(a*)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("a*"));
 | |
| 
 | |
|     assert_equals<std::string>(
 | |
|         "(a?)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("a?"));
 | |
| 
 | |
|     assert_equals<std::string>(
 | |
|         "([a-z])[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("[a-z]"));
 | |
| 
 | |
|     assert_equals<std::string>(
 | |
|         "((?:\\w+)?[a-z])[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("[a-z]\\w+"));
 | |
| 
 | |
|     assert_equals<std::string>(
 | |
|         "((?:a|b))[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("(?:a|b)"));
 | |
|     assert_equals<std::string>(
 | |
|         "((?:(?:(?:d)?c)?b)?a)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("abcd"));
 | |
|     assert_equals<std::string>(
 | |
|         "((?:b)?a*)[\\s\\S]*", // TODO: ((?:b)?a*+).* ??
 | |
|         regex_to_reversed_partial_regex("a*b"));
 | |
|     assert_equals<std::string>(
 | |
|         "((?:(?:b)?a)?.*)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex(".*?ab"));
 | |
|     assert_equals<std::string>(
 | |
|         "((?:(?:b)?.*)?a)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("a.*?b"));
 | |
|     assert_equals<std::string>(
 | |
|         "((?:(?:d)?(?:(?:c)?b))?a)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("a(bc)d"));
 | |
|     assert_equals<std::string>(
 | |
|         "((?:(?:(?:c)?b|(?:e)?d))?a)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("a(bc|de)"));
 | |
|     assert_equals<std::string>(
 | |
|         "((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)[\\s\\S]*",
 | |
|         regex_to_reversed_partial_regex("ab{2,4}c"));
 | |
| }
 | |
| 
 | |
| int main() {
 | |
|     test_regex_to_reversed_partial_regex();
 | |
|     test_regex();
 | |
|     std::cout << "All tests passed.\n";
 | |
| }
 |