mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	* move string_find_partial_stop & string_ends_with to common * add common_regex (supports partial matches) Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/regex-partial.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/regex-partial.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/regex-partial.h Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * partial regex: add missing iterator end checks * string utils: use string_views * direct throw to avoid ggml.h include * regex-partial: replace missed ggml_asserts --------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
		
			
				
	
	
		
			289 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			289 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
//  Tests common_regex (esp. its partial final matches support).
 | 
						|
 | 
						|
#include "common.h"
 | 
						|
#include "regex-partial.h"
 | 
						|
 | 
						|
#include <sstream>
 | 
						|
#include <iostream>
 | 
						|
#include <optional>
 | 
						|
 | 
						|
// Checks that `actual` equals `expected`.
//
// On mismatch, both values are written to std::cerr and a std::runtime_error
// ("Test failed") is thrown. T must support operator!= and operator<<.
template <class T> static void assert_equals(const T & expected, const T & actual) {
    if (expected != actual) {
        // std::cerr is unit-buffered, so every insertion is flushed immediately;
        // no std::endl / std::flush is needed to get the message out before the throw.
        std::cerr << "Expected: " << expected << '\n';
        std::cerr << "  Actual: " << actual << '\n';
        throw std::runtime_error("Test failed");
    }
}
 | 
						|
 | 
						|
struct test_case {
 | 
						|
    std::string pattern;
 | 
						|
    struct input_output {
 | 
						|
        std::string input;
 | 
						|
        common_regex_match output;
 | 
						|
    };
 | 
						|
    std::vector<input_output> inputs_outputs;
 | 
						|
};
 | 
						|
 | 
						|
// Returns the enumerator's identifier as text (used in failure diagnostics),
// or "?" for a value that matches none of the known enumerators.
static std::string common_regex_match_type_name(common_regex_match_type type) {
    const char * label = "?";
    switch (type) {
        case COMMON_REGEX_MATCH_TYPE_NONE:
            label = "COMMON_REGEX_MATCH_TYPE_NONE";
            break;
        case COMMON_REGEX_MATCH_TYPE_PARTIAL:
            label = "COMMON_REGEX_MATCH_TYPE_PARTIAL";
            break;
        case COMMON_REGEX_MATCH_TYPE_FULL:
            label = "COMMON_REGEX_MATCH_TYPE_FULL";
            break;
    }
    return label;
}
 | 
						|
 | 
						|
static void test_regex() {
 | 
						|
    printf("[%s]\n", __func__);
 | 
						|
    auto test = [](const test_case & test_case) {
 | 
						|
        common_regex cr(test_case.pattern);
 | 
						|
        std::cout << "Testing pattern: /" << test_case.pattern << "/\n";
 | 
						|
        // std::cout << "    partial rev: " << cr.reversed_partial_pattern.str() << '\n';
 | 
						|
        for (const auto & input_output : test_case.inputs_outputs) {
 | 
						|
            std::cout << "  Input: " << input_output.input << '\n';
 | 
						|
            auto m = cr.search(input_output.input, 0);
 | 
						|
            if (m != input_output.output) {
 | 
						|
                auto match_to_str = [&](const std::optional<common_regex_match> & m) {
 | 
						|
                    std::ostringstream ss;
 | 
						|
                    if (m->type == COMMON_REGEX_MATCH_TYPE_NONE) {
 | 
						|
                        ss << "<no match>";
 | 
						|
                    } else {
 | 
						|
                        GGML_ASSERT(!input_output.output.groups.empty());
 | 
						|
                        std::vector<std::string> parts;
 | 
						|
                        for (const auto & g : m->groups) {
 | 
						|
                            parts.push_back("{" + std::to_string(g.begin) + ", " + std::to_string(g.end) + "}");
 | 
						|
                        }
 | 
						|
                        ss << "{" << common_regex_match_type_name(m->type) << ", {" << string_join(parts, ", ") << "}}";
 | 
						|
                    }
 | 
						|
                    return ss.str();
 | 
						|
                };
 | 
						|
                std::cout << "    Expected: " << match_to_str(input_output.output) << '\n';
 | 
						|
                std::cout << "         Got: " << match_to_str(m) << '\n';
 | 
						|
                std::cout << " Inverted pattern: /" << regex_to_reversed_partial_regex(test_case.pattern) << "/\n";
 | 
						|
 | 
						|
                throw std::runtime_error("Test failed");
 | 
						|
            }
 | 
						|
        }
 | 
						|
    };
 | 
						|
    test({
 | 
						|
        "a",
 | 
						|
        {
 | 
						|
            {"a", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
 | 
						|
            {"b", {COMMON_REGEX_MATCH_TYPE_NONE, {}}},
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
 | 
						|
            {"ba", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 2}}}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
    test({
 | 
						|
        "abcd",
 | 
						|
        {
 | 
						|
            {"abcd", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
 | 
						|
            {"abcde", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
 | 
						|
            {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | 
						|
            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | 
						|
            {"d", {}},
 | 
						|
            {"bcd", {}},
 | 
						|
            {"cde", {}},
 | 
						|
            {"cd", {}},
 | 
						|
            {"yeah ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{5, 7}}}},
 | 
						|
            {"abbie", {}},
 | 
						|
            {"", {}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
    test({
 | 
						|
        ".*?ab",
 | 
						|
        {
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | 
						|
            {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | 
						|
            {"dab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | 
						|
            {"dabc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | 
						|
            {"da", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | 
						|
            {"d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
    test({
 | 
						|
        "a.*?b",
 | 
						|
        {
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | 
						|
            {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | 
						|
            {"a b", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | 
						|
            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | 
						|
            {"argh", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
 | 
						|
            {"d", {}},
 | 
						|
            {"b", {}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
    test({
 | 
						|
        "ab(?:cd){2,4}ef",
 | 
						|
        {
 | 
						|
            // {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, 0, {}}},
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | 
						|
            {"abcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
 | 
						|
            {"abcde", {}},
 | 
						|
            {"abcdef", {}},
 | 
						|
            {"abcdcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | 
						|
            {"abcdcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 7}}}},
 | 
						|
            {"abcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
 | 
						|
            {"abcdcdcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 12}}}},
 | 
						|
            {"abcdcdcdcdcdef", {}},
 | 
						|
            {"abcde", {}},
 | 
						|
            {"yea", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{2, 3}}}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
    test({
 | 
						|
        "a(?:rte| pure )fact",
 | 
						|
        {
 | 
						|
            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | 
						|
            {"art", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | 
						|
            {"artefa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | 
						|
            {"fact", {}},
 | 
						|
            {"an arte", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{3, 7}}}},
 | 
						|
            {"artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
 | 
						|
            {"an artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{3, 11}}}},
 | 
						|
            {"a pure", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | 
						|
            {"a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 11}}}},
 | 
						|
            {"it's a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{5, 16}}}},
 | 
						|
            {"" , {}},
 | 
						|
            {"pure", {}},
 | 
						|
            {"pure fact", {}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
    test({
 | 
						|
        "abc",
 | 
						|
        {
 | 
						|
            {" abcc", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 4}}}},
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | 
						|
            {"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | 
						|
            {" ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{1, 3}}}},
 | 
						|
            {"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
 | 
						|
            {"b", {}},
 | 
						|
            {"c", {}},
 | 
						|
            {"", {}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
 | 
						|
    test({
 | 
						|
        "(?:abc)?\\s*def",
 | 
						|
        {
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | 
						|
            {"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | 
						|
            {"abc ", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
 | 
						|
            {"abc d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
 | 
						|
            {"abc de", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | 
						|
            {"abc def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
 | 
						|
            {"abc defg", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
 | 
						|
            {"abc defgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
 | 
						|
            {"abcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
 | 
						|
            {"abcdefgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 6}}}},
 | 
						|
            {" d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
 | 
						|
            {"def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
 | 
						|
    test({
 | 
						|
        "a+b",
 | 
						|
        {
 | 
						|
            {"aaab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
 | 
						|
            {"aaa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
 | 
						|
            {"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
 | 
						|
        }
 | 
						|
    });
 | 
						|
 | 
						|
    test({
 | 
						|
        "(?:"
 | 
						|
            "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
 | 
						|
            "("                          // match 2 (open_tag)
 | 
						|
                "<tool_call>"
 | 
						|
                "|<function_call>"
 | 
						|
                "|<tool>"
 | 
						|
                "|<tools>"
 | 
						|
                "|<response>"
 | 
						|
                "|<json>"
 | 
						|
                "|<xml>"
 | 
						|
                "|<JSON>"
 | 
						|
            ")?"
 | 
						|
            "(\\s*\\{\\s*\"name\"\\s*:)" // match 3 (named tool call)
 | 
						|
        ")"
 | 
						|
        "|<function=([^>]+)>"            // match 4 (function name)
 | 
						|
        "|<function name=\"([^\"]+)\">", // match 5 (function name again)
 | 
						|
        {
 | 
						|
            {"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}, {54, 54}, {54, 54}, {0, 8}, {54, 54}, {54, 54}}}},
 | 
						|
            {"<tool_call> {\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 18}}}},
 | 
						|
            {"<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 17}}}},
 | 
						|
            {"Let's call something\n<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{21, 38}}}},
 | 
						|
            {"Ok then<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 24}}}},
 | 
						|
            {"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
 | 
						|
            {"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 13}}}},
 | 
						|
            {"<tool_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 20}, {66, 66}, {0, 11}, {11, 20}, {66, 66}, {66, 66}}}},
 | 
						|
            {"<function_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 24}, {70, 70}, {0, 15}, {15, 24}, {70, 70}, {70, 70}}}},
 | 
						|
            {"<function name=\"special_function\"> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 34}, {89, 89}, {89, 89}, {89, 89}, {89, 89}, {16, 32}}}},
 | 
						|
            {"<function=all>", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 14}, {14, 14}, {14, 14}, {14, 14}, {10, 13}, {14, 14}}}},
 | 
						|
 | 
						|
        }
 | 
						|
    });
 | 
						|
}
 | 
						|
 | 
						|
// Verifies the textual output of regex_to_reversed_partial_regex for a fixed set
// of patterns. Cases are checked in order; the first mismatch makes assert_equals
// report both strings and throw.
static void test_regex_to_reversed_partial_regex() {
    printf("[%s]\n", __func__);

    struct reversal_case {
        const char * expected; // reversed partial regex the function should return
        const char * pattern;  // pattern passed to the function
    };

    const reversal_case cases[] = {
        { "((?:(?:c)?b)?a)[\\s\\S]*",                      "abc"       },
        { "(a+)[\\s\\S]*",                                 "a+"        },
        { "(a*)[\\s\\S]*",                                 "a*"        },
        { "(a?)[\\s\\S]*",                                 "a?"        },
        { "([a-z])[\\s\\S]*",                              "[a-z]"     },
        { "((?:\\w+)?[a-z])[\\s\\S]*",                     "[a-z]\\w+" },
        { "((?:a|b))[\\s\\S]*",                            "(?:a|b)"   },
        { "((?:(?:(?:d)?c)?b)?a)[\\s\\S]*",                "abcd"      },
        { "((?:b)?a*)[\\s\\S]*",                           "a*b"       }, // TODO: ((?:b)?a*+).* ??
        { "((?:(?:b)?a)?.*)[\\s\\S]*",                     ".*?ab"     },
        { "((?:(?:b)?.*)?a)[\\s\\S]*",                     "a.*?b"     },
        { "((?:(?:d)?(?:(?:c)?b))?a)[\\s\\S]*",            "a(bc)d"    },
        { "((?:(?:(?:c)?b|(?:e)?d))?a)[\\s\\S]*",          "a(bc|de)"  },
        { "((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)[\\s\\S]*",  "ab{2,4}c"  },
    };

    for (const auto & c : cases) {
        assert_equals<std::string>(c.expected, regex_to_reversed_partial_regex(c.pattern));
    }
}
 | 
						|
 | 
						|
// Test entry point. Each suite throws std::runtime_error on its first failure;
// nothing here catches it, so the success line is printed only when both
// suites complete.
int main() {
    // Check the pattern-reversal helper first, then the search behaviour.
    test_regex_to_reversed_partial_regex();
    test_regex();

    std::cout << "All tests passed.\n";
    return 0;
}
 |