mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	llama/ggml: add LLM training support
more compact progress bar; refactor: llama_prepare_sbatch/ubatch; llama_save_model_to_file; gqa_mode arg for repeat_back; llama_opt_param_filter; ggml_graph_dup force_grads; refactor ggml_opt, fix test-opt
This commit is contained in:
		
				
					committed by
					
						
						Georgi Gerganov
					
				
			
			
				
	
			
			
			
						parent
						
							b34443923c
						
					
				
				
					commit
					111c9c75d6
				
			@@ -823,7 +823,7 @@ struct test_case {
 | 
			
		||||
 | 
			
		||||
        ggml_build_forward_expand(gf, out);
 | 
			
		||||
        ggml_graph_cpy(gf, gb);
 | 
			
		||||
        ggml_build_backward_expand(ctx.get(), ctx.get(), gb, false);
 | 
			
		||||
        ggml_build_backward_expand(ctx.get(), gb, nullptr);
 | 
			
		||||
        if (expect.size() != 1 || expect[0] != 0.0f) {
 | 
			
		||||
            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
 | 
			
		||||
            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != NULL; t = ggml_get_next_tensor(ctx.get(), t)) {
 | 
			
		||||
@@ -1026,7 +1026,7 @@ struct test_example : public test_case {
 | 
			
		||||
        // Step 3: return the output tensor.
 | 
			
		||||
        return out;
 | 
			
		||||
    }
 | 
			
		||||
    // In order to also check the gradients for your op, add calls like ggml_set_param(ctx, a)
 | 
			
		||||
    // In order to also check the gradients for your op, add calls like ggml_set_param(a)
 | 
			
		||||
    // immediately after you create the tensors.
 | 
			
		||||
    // This is optional and only makes sense if a backward pass has actually been implemented for the new op.
 | 
			
		||||
};
 | 
			
		||||
@@ -1058,7 +1058,7 @@ struct test_unary : public test_case {
 | 
			
		||||
            auto ne = ne_a; ne[0] *= 3;
 | 
			
		||||
            a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
            if (grad_supported) {
 | 
			
		||||
                ggml_set_param(ctx, a);
 | 
			
		||||
                ggml_set_param(a);
 | 
			
		||||
            }
 | 
			
		||||
            ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
@@ -1067,7 +1067,7 @@ struct test_unary : public test_case {
 | 
			
		||||
        } else {
 | 
			
		||||
            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
 | 
			
		||||
            if (grad_supported) {
 | 
			
		||||
                ggml_set_param(ctx, a);
 | 
			
		||||
                ggml_set_param(a);
 | 
			
		||||
            }
 | 
			
		||||
            ggml_set_name(a, "a");
 | 
			
		||||
        }
 | 
			
		||||
@@ -1133,7 +1133,7 @@ struct test_get_rows : public test_case {
 | 
			
		||||
 | 
			
		||||
        const bool grad_supported = ggml_is_matrix(in) && ggml_is_vector(rows);
 | 
			
		||||
        if (grad_supported) {
 | 
			
		||||
            ggml_set_param(ctx, in);
 | 
			
		||||
            ggml_set_param(in);
 | 
			
		||||
            // rows is a constant input -> no gradients
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@@ -1322,7 +1322,7 @@ struct test_repeat : public test_case {
 | 
			
		||||
        ggml_set_name(target, "target");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, src);
 | 
			
		||||
        ggml_set_param(src);
 | 
			
		||||
        ggml_set_name(src, "src");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_repeat(ctx, src, target);
 | 
			
		||||
@@ -1406,7 +1406,7 @@ struct test_dup : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, src);
 | 
			
		||||
        ggml_set_param(src);
 | 
			
		||||
        ggml_set_name(src, "src");
 | 
			
		||||
 | 
			
		||||
        if (_use_permute) {
 | 
			
		||||
@@ -1442,7 +1442,7 @@ struct test_set : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, src);
 | 
			
		||||
        ggml_set_param(src);
 | 
			
		||||
        ggml_set_name(src, "src");
 | 
			
		||||
 | 
			
		||||
        auto ne_dst = ne;
 | 
			
		||||
@@ -1450,7 +1450,7 @@ struct test_set : public test_case {
 | 
			
		||||
            ne_dst[i] *= 2;
 | 
			
		||||
        }
 | 
			
		||||
        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, ne_dst.data());
 | 
			
		||||
        ggml_set_param(ctx, dst);
 | 
			
		||||
        ggml_set_param(dst);
 | 
			
		||||
        ggml_set_name(dst, "dst");
 | 
			
		||||
 | 
			
		||||
        size_t offset = 0;
 | 
			
		||||
@@ -1498,7 +1498,7 @@ struct test_cpy : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, src);
 | 
			
		||||
        ggml_set_param(src);
 | 
			
		||||
        ggml_set_name(src, "src");
 | 
			
		||||
 | 
			
		||||
        if (_src_use_permute) {
 | 
			
		||||
@@ -1536,7 +1536,7 @@ struct test_cont : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, src);
 | 
			
		||||
        ggml_set_param(src);
 | 
			
		||||
        ggml_set_name(src, "src");
 | 
			
		||||
 | 
			
		||||
        src = ggml_transpose(ctx, src);
 | 
			
		||||
@@ -1583,8 +1583,8 @@ struct test_bin_bcast : public test_case {
 | 
			
		||||
        // The backward pass supports broadcasting only for GGML_ADD:
 | 
			
		||||
        const bool grad_supported = op == ggml_add || ggml_are_same_shape(a, b);
 | 
			
		||||
        if (grad_supported) {
 | 
			
		||||
            ggml_set_param(ctx, a);
 | 
			
		||||
            ggml_set_param(ctx, b);
 | 
			
		||||
            ggml_set_param(a);
 | 
			
		||||
            ggml_set_param(b);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = op(ctx, a, b);
 | 
			
		||||
@@ -1632,11 +1632,11 @@ struct test_add1 : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
 | 
			
		||||
        // ggml_set_param(ctx, b); // TODO: implement
 | 
			
		||||
        // ggml_set_param(b); // TODO: implement
 | 
			
		||||
        ggml_set_name(b, "b");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_add1(ctx, a, b);
 | 
			
		||||
@@ -1667,7 +1667,7 @@ struct test_scale : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_scale(ctx, a, scale);
 | 
			
		||||
@@ -1762,7 +1762,7 @@ struct test_rms_norm : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        if (v) {
 | 
			
		||||
@@ -2028,9 +2028,9 @@ struct test_mul_mat : public test_case {
 | 
			
		||||
            b = ggml_new_tensor_4d(ctx, type_b, ne_b[per[0]], ne_b[per[1]], ne_b[per[2]], ne_b[per[3]]);
 | 
			
		||||
            if (!ggml_is_quantized(type_a)) {
 | 
			
		||||
                if (bs[1] == 1 && nr[1] == 1) {
 | 
			
		||||
                    ggml_set_param(ctx, a);
 | 
			
		||||
                    ggml_set_param(a);
 | 
			
		||||
                }
 | 
			
		||||
                ggml_set_param(ctx, b);
 | 
			
		||||
                ggml_set_param(b);
 | 
			
		||||
            }
 | 
			
		||||
            ggml_set_name(a, "a");
 | 
			
		||||
            ggml_set_name(b, "b");
 | 
			
		||||
@@ -2053,9 +2053,9 @@ struct test_mul_mat : public test_case {
 | 
			
		||||
            }
 | 
			
		||||
            if (!ggml_is_quantized(type_a)) {
 | 
			
		||||
                if (bs[1] == 1 && nr[1] == 1) {
 | 
			
		||||
                    ggml_set_param(ctx, a);
 | 
			
		||||
                    ggml_set_param(a);
 | 
			
		||||
                }
 | 
			
		||||
                ggml_set_param(ctx, b);
 | 
			
		||||
                ggml_set_param(b);
 | 
			
		||||
            }
 | 
			
		||||
            ggml_set_name(a, "a");
 | 
			
		||||
            ggml_set_name(b, "b");
 | 
			
		||||
@@ -2204,7 +2204,7 @@ struct test_sqr : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_sqr(ctx, a);
 | 
			
		||||
@@ -2233,7 +2233,7 @@ struct test_sqrt : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_sqrt(ctx, a);
 | 
			
		||||
@@ -2273,7 +2273,7 @@ struct test_log : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_log(ctx, a);
 | 
			
		||||
@@ -2309,7 +2309,7 @@ struct test_sin : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_sin(ctx, a);
 | 
			
		||||
@@ -2352,7 +2352,7 @@ struct test_cos : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_cos(ctx, a);
 | 
			
		||||
@@ -2432,7 +2432,7 @@ struct test_diag_mask_inf : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
 | 
			
		||||
@@ -2471,7 +2471,7 @@ struct test_soft_max : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * mask = nullptr;
 | 
			
		||||
@@ -2553,7 +2553,7 @@ struct test_rope : public test_case {
 | 
			
		||||
            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
 | 
			
		||||
            a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
            if (forward) {
 | 
			
		||||
                ggml_set_param(ctx, a);
 | 
			
		||||
                ggml_set_param(a);
 | 
			
		||||
            }
 | 
			
		||||
            ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
@@ -2562,7 +2562,7 @@ struct test_rope : public test_case {
 | 
			
		||||
        } else {
 | 
			
		||||
            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
 | 
			
		||||
            if (forward) {
 | 
			
		||||
                ggml_set_param(ctx, a);
 | 
			
		||||
                ggml_set_param(a);
 | 
			
		||||
            }
 | 
			
		||||
            ggml_set_name(a, "a");
 | 
			
		||||
        }
 | 
			
		||||
@@ -2676,7 +2676,7 @@ struct test_pool2d : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
 | 
			
		||||
        ggml_set_param(ctx, input);
 | 
			
		||||
        ggml_set_param(input);
 | 
			
		||||
        ggml_set_name(input, "input");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
 | 
			
		||||
@@ -2752,7 +2752,7 @@ struct test_im2col : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
 | 
			
		||||
        ggml_set_param(ctx, input);
 | 
			
		||||
        ggml_set_param(input);
 | 
			
		||||
        ggml_set_name(input, "input");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
 | 
			
		||||
@@ -2929,7 +2929,7 @@ struct test_sum : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_sum(ctx, a);
 | 
			
		||||
@@ -2958,7 +2958,7 @@ struct test_sum_rows : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_sum_rows(ctx, a);
 | 
			
		||||
@@ -2983,7 +2983,7 @@ struct test_mean : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_mean(ctx, a);
 | 
			
		||||
@@ -3129,11 +3129,11 @@ struct test_acc : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
 | 
			
		||||
        ggml_set_param(ctx, a);
 | 
			
		||||
        ggml_set_param(a);
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
 | 
			
		||||
        ggml_set_param(ctx, b);
 | 
			
		||||
        ggml_set_param(b);
 | 
			
		||||
        ggml_set_name(b, "b");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
 | 
			
		||||
@@ -3370,7 +3370,7 @@ struct test_cross_entropy_loss : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * logits = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
        ggml_set_param(ctx, logits);
 | 
			
		||||
        ggml_set_param(logits);
 | 
			
		||||
        ggml_set_name(logits, "logits");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * labels = ggml_new_tensor(ctx, type, 4, ne.data());
 | 
			
		||||
@@ -3452,7 +3452,7 @@ struct test_opt_step_adamw : public test_case {
 | 
			
		||||
 | 
			
		||||
    ggml_tensor * build_graph(ggml_context * ctx) override {
 | 
			
		||||
        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
 | 
			
		||||
        ggml_set_param(ctx, a); // Despite tensor a having gradients the output tensor will not.
 | 
			
		||||
        ggml_set_param(a); // Despite tensor a having gradients the output tensor will not.
 | 
			
		||||
        ggml_set_name(a, "a");
 | 
			
		||||
 | 
			
		||||
        ggml_tensor * grad = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2], ne[3]);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user