Merge branch 'master' into compilade/test-model-random

This commit is contained in:
Francis Couture-Harpin
2025-06-13 14:31:39 -04:00
28 changed files with 499 additions and 359 deletions

View File

@@ -49,19 +49,23 @@ COPY --from=build /app/full /app
WORKDIR /app WORKDIR /app
RUN apt-get update \ RUN apt-get update && \
&& apt-get install -y \ apt-get install -y \
git \ git \
python3 \ python3 \
python3-pip \ python3-pip \
&& pip install --upgrade pip setuptools wheel \ python3-venv && \
&& pip install -r requirements.txt \ python3 -m venv /opt/venv && \
&& apt autoremove -y \ . /opt/venv/bin/activate && \
&& apt clean -y \ pip install --upgrade pip setuptools wheel && \
&& rm -rf /tmp/* /var/tmp/* \ pip install -r requirements.txt && \
&& find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ apt autoremove -y && \
&& find /var/cache -type f -delete apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
ENV PATH="/opt/venv/bin:$PATH"
ENTRYPOINT ["/app/tools.sh"] ENTRYPOINT ["/app/tools.sh"]

View File

@@ -89,6 +89,14 @@ option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
if (NOT DEFINED LLAMA_BUILD_NUMBER)
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
endif()
if (NOT DEFINED LLAMA_BUILD_COMMIT)
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
endif()
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
# override ggml options # override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
@@ -155,6 +163,8 @@ if (LLAMA_USE_SYSTEM_GGML)
endif() endif()
if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
set(GGML_BUILD_NUMBER ${LLAMA_BUILD_NUMBER})
set(GGML_BUILD_COMMIT ${LLAMA_BUILD_COMMIT})
add_subdirectory(ggml) add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt # ... otherwise assume ggml is added by a parent CMakeLists.txt
endif() endif()
@@ -204,10 +214,6 @@ endif()
include(GNUInstallDirs) include(GNUInstallDirs)
include(CMakePackageConfigHelpers) include(CMakePackageConfigHelpers)
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")

View File

@@ -6,7 +6,7 @@
[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases) [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
@@ -18,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics ## Hot topics
- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md) - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639

View File

@@ -23,31 +23,21 @@ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
endif() endif()
if(EXISTS "${GIT_DIR}/index") if(EXISTS "${GIT_DIR}/index")
set(GIT_INDEX "${GIT_DIR}/index") # For build-info.cpp below
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
else() else()
message(WARNING "Git index not found in git repository.") message(WARNING "Git index not found in git repository.")
set(GIT_INDEX "")
endif() endif()
else() else()
message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
set(GIT_INDEX "")
endif() endif()
# Add a custom command to rebuild build-info.cpp when .git/index changes set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
add_custom_command( set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp" configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
COMMENT "Generating build details from Git"
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
VERBATIM
)
set(TARGET build_info) set(TARGET build_info)
add_library(${TARGET} OBJECT build-info.cpp) add_library(${TARGET} OBJECT ${OUTPUT_FILE})
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif() endif()

View File

@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@; int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
char const *LLAMA_COMMIT = "@BUILD_COMMIT@"; char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
char const *LLAMA_COMPILER = "@BUILD_COMPILER@"; char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@"; char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";

View File

@@ -1,24 +0,0 @@
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
# Only write the build info if it changed
if(EXISTS ${OUTPUT_FILE})
file(READ ${OUTPUT_FILE} CONTENTS)
string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_COMMIT ${CMAKE_MATCH_1})
string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_COMPILER ${CMAKE_MATCH_1})
string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
set(OLD_TARGET ${CMAKE_MATCH_1})
if (
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
NOT OLD_TARGET STREQUAL BUILD_TARGET
)
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
endif()
else()
configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
endif()

View File

@@ -107,3 +107,7 @@ NOTE: some models may require large context window, for example: `-c 8192`
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF (tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF (tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
``` ```
## Finding more models:
GGUF models on Huggingface with vision capabilities can be found here: https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending&search=gguf

View File

@@ -44,21 +44,22 @@ if (GGML_METAL_EMBED_LIBRARY)
set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp") set(METALLIB_SOURCE_EMBED_TMP "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal.tmp")
add_custom_command( add_custom_command(
OUTPUT ${METALLIB_EMBED_ASM} OUTPUT "${METALLIB_EMBED_ASM}"
COMMAND echo "Embedding Metal library" COMMAND echo "Embedding Metal library"
COMMAND sed -e '/__embed_ggml-common.h__/r ${METALLIB_COMMON}' -e '/__embed_ggml-common.h__/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED_TMP} COMMAND sed -e "/__embed_ggml-common.h__/r ${METALLIB_COMMON}" -e "/__embed_ggml-common.h__/d" < "${METALLIB_SOURCE}" > "${METALLIB_SOURCE_EMBED_TMP}"
COMMAND sed -e '/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}' -e '/\#include \"ggml-metal-impl.h\"/d' < ${METALLIB_SOURCE_EMBED_TMP} > ${METALLIB_SOURCE_EMBED} COMMAND sed -e "/\#include \"ggml-metal-impl.h\"/r ${METALLIB_IMPL}" -e "/\#include \"ggml-metal-impl.h\"/d" < "${METALLIB_SOURCE_EMBED_TMP}" > "${METALLIB_SOURCE_EMBED}"
COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}"
COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}"
COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}"
COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} COMMAND echo .incbin "\"${METALLIB_SOURCE_EMBED}\"" >> "${METALLIB_EMBED_ASM}"
COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}"
COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}"
DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h
COMMENT "Generate assembly for embedded Metal library" COMMENT "Generate assembly for embedded Metal library"
VERBATIM
) )
target_sources(ggml-metal PRIVATE ${METALLIB_EMBED_ASM}) target_sources(ggml-metal PRIVATE "${METALLIB_EMBED_ASM}")
else() else()
if (GGML_METAL_SHADER_DEBUG) if (GGML_METAL_SHADER_DEBUG)
# custom command to do the following: # custom command to do the following:

View File

@@ -142,7 +142,7 @@ else()
FetchContent_Declare( FetchContent_Declare(
ONEMATH ONEMATH
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
) )
FetchContent_MakeAvailable(ONEMATH) FetchContent_MakeAvailable(ONEMATH)
# Create alias to match with find_package targets name # Create alias to match with find_package targets name

View File

@@ -513,9 +513,9 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
bool gpu_has_xmx(sycl::device &dev); bool gpu_has_xmx(sycl::device &dev);
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) { template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
if (LIKELY(!g_ggml_sycl_debug)) { if (LIKELY(!g_ggml_sycl_debug)) {
return; return "";
} }
std::stringstream ss; std::stringstream ss;
ss << prefix << "=["; ss << prefix << "=[";
@@ -526,29 +526,26 @@ template <int N, class T> void debug_print_array(const std::string & prefix, con
ss << array[N - 1]; ss << array[N - 1];
} }
ss << "]"; ss << "]";
GGML_SYCL_DEBUG("%s", ss.str().c_str()); return ss.str();
} }
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor, inline std::string debug_get_tensor_str(const std::string &prefix,
const std::string & suffix = "") { const ggml_tensor *tensor, const std::string &suffix = "") {
if (LIKELY(!g_ggml_sycl_debug)) { std::stringstream ss;
return; if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
} ss << prefix.c_str() << "=";
GGML_SYCL_DEBUG("%s=", prefix.c_str());
if (tensor) { if (tensor) {
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type)); ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne); ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb); ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);
if (!ggml_is_contiguous(tensor)) {
GGML_SYCL_DEBUG(";strided"); if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
} if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
if (ggml_is_permuted(tensor)) {
GGML_SYCL_DEBUG(";permuted");
}
} else { } else {
GGML_SYCL_DEBUG("nullptr"); ss << "nullptr";
} }
GGML_SYCL_DEBUG("%s", suffix.c_str()); ss << suffix;
return ss.str();
} }
// Use scope_op_debug_print to log operations coming from running a model // Use scope_op_debug_print to log operations coming from running a model
@@ -564,10 +561,10 @@ struct scope_op_debug_print {
return; return;
} }
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data()); GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
debug_print_tensor(" dst", dst); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
if (dst) { if (dst) {
for (std::size_t i = 0; i < num_src; ++i) { for (std::size_t i = 0; i < num_src; ++i) {
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]); GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
} }
} }
GGML_SYCL_DEBUG("%s\n", suffix.data()); GGML_SYCL_DEBUG("%s\n", suffix.data());

View File

@@ -723,8 +723,7 @@ static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
// Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0));
std::string(" src0 type=") + ggml_type_name(src0->type));
const int64_t ne = ggml_nelements(src0); const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ne == ggml_nelements(src1));

View File

@@ -65,6 +65,9 @@ public:
dnnl::primitive_attr primitive_attr; dnnl::primitive_attr primitive_attr;
primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
#ifdef GGML_SYCL_F16
primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
#endif
auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a)); auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b)); auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));

View File

@@ -347,7 +347,7 @@ static enum ggml_status
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor) try { ggml_tensor *tensor) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor, "\n"); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
if (tensor->view_src != NULL) { if (tensor->view_src != NULL) {
@@ -385,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
const void *data, size_t offset, const void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
ggml_sycl_set_device(ctx->device); ggml_sycl_set_device(ctx->device);
@@ -413,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
void *data, size_t offset, void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@@ -444,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *dst) try { ggml_tensor *dst) try {
bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer); bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": dst=", dst); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
debug_print_tensor(" src=", src); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
if (is_cpy_supported) { if (is_cpy_supported) {
ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
@@ -525,7 +525,7 @@ catch (sycl::exception const &exc) {
static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
size_t offset, size_t size) { size_t offset, size_t size) {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value); GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
SYCL_CHECK(ggml_sycl_set_device(ctx->device)); SYCL_CHECK(ggml_sycl_set_device(ctx->device));
@@ -805,7 +805,7 @@ static enum ggml_status
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor) try { ggml_tensor *tensor) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor, "\n"); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
@@ -891,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
ggml_tensor *tensor, const void *data, ggml_tensor *tensor, const void *data,
size_t offset, size_t size) try { size_t offset, size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
// split tensors must always be set in their entirety at once // split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0); GGML_ASSERT(offset == 0);
@@ -947,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
const ggml_tensor *tensor, void *data, const ggml_tensor *tensor, void *data,
size_t offset, size_t size) try { size_t offset, size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
// split tensors must always be set in their entirety at once // split tensors must always be set in their entirety at once
GGML_ASSERT(offset == 0); GGML_ASSERT(offset == 0);
@@ -2127,21 +2127,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16 const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
? (const sycl::half *)src1->data + src1_padded_row_size ? (const sycl::half *)src1->data + src1_padded_row_size
: src1_as_f16.get(); : src1_as_f16.get();
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
#if GGML_SYCL_DNNL #if GGML_SYCL_DNNL
if (!g_ggml_sycl_disable_dnn) { if (!g_ggml_sycl_disable_dnn) {
DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream); dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
" : converting dst to fp32");
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
} }
else else
#endif #endif
{ {
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
const sycl::half alpha_f16 = 1.0f; const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f; const sycl::half beta_f16 = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
@@ -3866,7 +3863,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
const void *data, size_t offset, const void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3887,7 +3884,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
void *data, size_t offset, void *data, size_t offset,
size_t size) try { size_t size) try {
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": tensor=", tensor); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3910,8 +3907,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
ggml_backend_buffer_is_sycl(src->buffer); ggml_backend_buffer_is_sycl(src->buffer);
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
debug_print_tensor(": dst=", dst); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
debug_print_tensor(" src=", src); GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
if (is_cpy_supported) { if (is_cpy_supported) {
/* /*

View File

@@ -1,8 +1,13 @@
#include "llama-batch.h" #include "llama-batch.h"
#include "llama-impl.h"
#include "llama-cparams.h"
#include "llama-vocab.h"
#include <cassert> #include <cassert>
#include <cstring> #include <cstring>
#include <algorithm> #include <algorithm>
#include <sstream>
llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) {
// clear empty sequences // clear empty sequences
@@ -105,12 +110,7 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s
ubatch.seq_id = batch->seq_id + seq.offset; ubatch.seq_id = batch->seq_id + seq.offset;
} }
} }
if (logits_all) { if (batch->logits) {
for (size_t i = 0; i < length; ++i) {
ubatch.output[ubatch.n_tokens + i] = 1;
out_ids.push_back(ids[seq.offset + i]);
}
} else if (batch->logits) {
if (ubatch.equal_seqs) { if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
size_t id = ids[seq.offset + i]; size_t id = ids[seq.offset + i];
@@ -197,11 +197,10 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
return ubatch; return ubatch;
} }
llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) { llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split) {
GGML_ASSERT(batch.n_tokens >= 0); GGML_ASSERT(batch.n_tokens >= 0);
this->batch = &batch; this->batch = &batch;
this->n_embd = n_embd; this->n_embd = n_embd;
this->logits_all = logits_all;
n_tokens = batch.n_tokens; n_tokens = batch.n_tokens;
ids.resize(n_tokens); ids.resize(n_tokens);
@@ -285,9 +284,45 @@ llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple
); );
} }
llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) { llama_batch_allocr::llama_batch_allocr() {
batch = in_batch; const char * LLAMA_BATCH_DEBUG = getenv("LLAMA_BATCH_DEBUG");
debug = LLAMA_BATCH_DEBUG ? atoi(LLAMA_BATCH_DEBUG) : 0;
}
bool llama_batch_allocr::init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0) {
clear();
batch = batch_inp;
GGML_ASSERT(batch.n_tokens > 0); GGML_ASSERT(batch.n_tokens > 0);
if (!batch.pos) {
if (batch.seq_id) {
LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__);
return false;
}
}
if (batch.token) {
for (int32_t i = 0; i < batch.n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return false;
}
}
}
if (batch.seq_id) {
for (int32_t i = 0; i < batch.n_tokens; ++i) {
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_PARALLEL_SEQUENCES);
return false;
}
}
}
}
if (!batch.pos) { if (!batch.pos) {
assert(p0 >= 0); assert(p0 >= 0);
pos.resize(batch.n_tokens); pos.resize(batch.n_tokens);
@@ -296,6 +331,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
} }
batch.pos = pos.data(); batch.pos = pos.data();
} }
if (!batch.n_seq_id) { if (!batch.n_seq_id) {
n_seq_id.resize(batch.n_tokens); n_seq_id.resize(batch.n_tokens);
for (int32_t i = 0; i < batch.n_tokens; i++) { for (int32_t i = 0; i < batch.n_tokens; i++) {
@@ -303,6 +339,7 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
} }
batch.n_seq_id = n_seq_id.data(); batch.n_seq_id = n_seq_id.data();
} }
if (!batch.seq_id) { if (!batch.seq_id) {
seq_id.resize(batch.n_tokens + 1); seq_id.resize(batch.n_tokens + 1);
seq_id[batch.n_tokens] = NULL; seq_id[batch.n_tokens] = NULL;
@@ -311,11 +348,84 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
} }
batch.seq_id = seq_id.data(); batch.seq_id = seq_id.data();
} }
if (!batch.logits) { if (!batch.logits) {
logits.resize(batch.n_tokens); // by default return the output only for the last token
logits[logits.size() - 1] = true; output.resize(batch.n_tokens);
batch.logits = logits.data(); output[output.size() - 1] = true;
batch.logits = output.data();
} }
for (int32_t i = 0; i < batch.n_tokens; ++i) {
n_outputs += batch.logits[i] != 0;
}
if (debug > 0) {
LLAMA_LOG_DEBUG("%s: input batch info (p0 = %d):\n", __func__, p0);
LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, batch.n_tokens);
LLAMA_LOG_DEBUG("%s: token = %p\n", __func__, (void *) batch.token);
LLAMA_LOG_DEBUG("%s: embd = %p\n", __func__, (void *) batch.embd);
LLAMA_LOG_DEBUG("%s: pos = %p\n", __func__, (void *) batch.pos);
LLAMA_LOG_DEBUG("%s: n_seq_id = %p\n", __func__, (void *) batch.n_seq_id);
LLAMA_LOG_DEBUG("%s: seq_id = %p\n", __func__, (void *) batch.seq_id);
LLAMA_LOG_DEBUG("%s: logits = %p\n", __func__, (void *) batch.logits);
LLAMA_LOG_DEBUG("%s: n_outputs = %d\n", __func__, n_outputs);
if (debug > 1) {
int seq_id_max = 0;
for (int32_t i = 0; i < batch.n_tokens; ++i) {
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
seq_id_max = std::max(seq_id_max, batch.seq_id[i][s]);
}
}
}
++seq_id_max;
LLAMA_LOG_DEBUG("%s: token = [\n", __func__);
for (int32_t i = 0; i < batch.n_tokens; ++i) {
std::vector<int8_t> seq_id(seq_id_max);
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
seq_id[batch.seq_id[i][s]] = 1;
}
std::stringstream ss;
for (int s = 0; s < seq_id_max; ++s) {
if (seq_id[s]) {
ss << s%10;
} else {
ss << ".";
}
}
LLAMA_LOG_DEBUG("%s: %4d: id = %6d (%16s), pos = %4d, n_seq_id = %2d, seq_id = [%s], output = %d\n",
__func__, i, batch.token[i], vocab.token_to_piece(batch.token[i]).c_str(),
batch.pos[i], batch.n_seq_id[i], ss.str().c_str(), batch.logits[i]);
}
LLAMA_LOG_DEBUG("%s: ]\n", __func__);
}
}
return true;
}
const llama_batch & llama_batch_allocr::get_batch() const {
return batch;
}
uint32_t llama_batch_allocr::get_n_outputs() const {
return n_outputs;
}
void llama_batch_allocr::clear() {
n_outputs = 0;
batch = {};
pos.clear();
n_seq_id.clear();
seq_id.clear();
output.clear();
} }
// //

View File

@@ -18,8 +18,8 @@ struct llama_ubatch {
llama_token * token; // [n_tokens] llama_token * token; // [n_tokens]
float * embd; // [n_embd, n_tokens] float * embd; // [n_embd, n_tokens]
llama_pos * pos; // [n_tokens] llama_pos * pos; // [n_tokens]
int32_t * n_seq_id; // [n_seqs] // TODO: remove, should belong to only 1 sequence int32_t * n_seq_id; // [n_seqs]
llama_seq_id ** seq_id; // [n_seqs] // TODO: become llama_seq_id * seq_id; llama_seq_id ** seq_id; // [n_seqs]
int8_t * output; // [n_tokens] int8_t * output; // [n_tokens]
}; };
@@ -39,8 +39,6 @@ struct llama_sbatch {
size_t n_embd; size_t n_embd;
bool logits_all; // TODO: remove once lctx.logits_all is removed too
// sorted indices into the batch // sorted indices into the batch
std::vector<int64_t> ids; std::vector<int64_t> ids;
// batch indices of the output // batch indices of the output
@@ -76,19 +74,34 @@ struct llama_sbatch {
llama_ubatch split_seq(size_t n_ubatch); llama_ubatch split_seq(size_t n_ubatch);
llama_sbatch() = default; llama_sbatch() = default;
llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false); llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false);
}; };
// temporary allocate memory for the input batch if needed // temporary allocate memory for the input batch if needed
struct llama_batch_allocr { class llama_batch_allocr {
struct llama_batch batch; public:
llama_batch_allocr();
// optionally fulfill the batch returned by llama_batch_get_one
bool init(const llama_batch & batch_inp, const llama_vocab & vocab, llama_pos p0);
const llama_batch & get_batch() const;
uint32_t get_n_outputs() const;
private:
void clear();
llama_batch batch;
uint32_t n_outputs;
std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
std::vector<llama_pos> pos; std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id; std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id; std::vector<llama_seq_id *> seq_id;
std::vector<int8_t> logits; std::vector<int8_t> output;
// optionally fulfill the batch returned by llama_batch_get_one int debug;
llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
}; };

View File

@@ -1,6 +1,7 @@
#include "llama-context.h" #include "llama-context.h"
#include "llama-impl.h" #include "llama-impl.h"
#include "llama-batch.h"
#include "llama-io.h" #include "llama-io.h"
#include "llama-memory.h" #include "llama-memory.h"
#include "llama-mmap.h" #include "llama-mmap.h"
@@ -18,7 +19,8 @@
llama_context::llama_context( llama_context::llama_context(
const llama_model & model, const llama_model & model,
llama_context_params params) : llama_context_params params) :
model(model) { model(model),
batch_allocr(std::make_unique<llama_batch_allocr>()) {
LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
t_start_us = model.t_start_us; t_start_us = model.t_start_us;
@@ -494,7 +496,7 @@ float * llama_context::get_logits() {
} }
float * llama_context::get_logits_ith(int32_t i) { float * llama_context::get_logits_ith(int32_t i) {
int32_t j = -1; int64_t j = -1;
try { try {
if (logits == nullptr) { if (logits == nullptr) {
@@ -517,7 +519,7 @@ float * llama_context::get_logits_ith(int32_t i) {
} }
if (j >= n_outputs) { if (j >= n_outputs) {
// This should not happen // This should not happen
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
} }
return logits + j*model.vocab.n_tokens(); return logits + j*model.vocab.n_tokens();
@@ -536,7 +538,7 @@ float * llama_context::get_embeddings() {
} }
float * llama_context::get_embeddings_ith(int32_t i) { float * llama_context::get_embeddings_ith(int32_t i) {
int32_t j = -1; int64_t j = -1;
try { try {
if (embd == nullptr) { if (embd == nullptr) {
@@ -559,7 +561,7 @@ float * llama_context::get_embeddings_ith(int32_t i) {
} }
if (j >= n_outputs) { if (j >= n_outputs) {
// This should not happen // This should not happen
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs)); throw std::runtime_error(format("corrupt output buffer (j=%" PRId64 ", n_outputs=%d)", j, n_outputs));
} }
return embd + j*model.hparams.n_embd; return embd + j*model.hparams.n_embd;
@@ -719,52 +721,42 @@ llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch,
return res; return res;
} }
int llama_context::encode(llama_batch & inp_batch) { int llama_context::encode(const llama_batch & batch_inp) {
if (inp_batch.n_tokens == 0) { if (batch_inp.n_tokens == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1; return -1;
} }
// temporary allocate memory for the input batch if needed // temporary allocate memory for the input batch if needed
// note: during encode, we always pass the full sequence starting from pos = 0 // note: during encode, we always pass the full sequence starting from pos = 0
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0); if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : 0)) {
LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
return -1;
}
const llama_batch & batch = batch_allocr.batch; const llama_batch & batch = batch_allocr->get_batch();
const int32_t n_tokens = batch.n_tokens;
const auto & hparams = model.hparams; const uint32_t n_tokens = batch.n_tokens;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
// TODO: move the validation to the llama_batch_allocr
if (batch.token) {
for (int32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1;
}
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) {
LLAMA_LOG_ERROR("%s: invalid seq_id[%d] = %d > %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
throw -1;
}
}
}
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens"); GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
if (t_compute_start_us == 0) { if (t_compute_start_us == 0) {
t_compute_start_us = ggml_time_us(); t_compute_start_us = ggml_time_us();
} }
// TODO: this clear of the buffer can easily be forgotten - need something better
embd_seq.clear(); embd_seq.clear();
n_queued_tokens += n_tokens; n_queued_tokens += n_tokens;
const auto & hparams = model.hparams;
const int64_t n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true); llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true);
const llama_ubatch ubatch = sbatch.split_simple(n_tokens); const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
@@ -774,7 +766,7 @@ int llama_context::encode(llama_batch & inp_batch) {
return -2; return -2;
}; };
for (int32_t i = 0; i < n_tokens; ++i) { for (uint32_t i = 0; i < n_tokens; ++i) {
output_ids[i] = i; output_ids[i] = i;
} }
@@ -830,7 +822,8 @@ int llama_context::encode(llama_batch & inp_batch) {
GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
for (int32_t i = 0; i < n_tokens; i++) { // TODO: fix indexing [UBATCH_IDX]
for (uint32_t i = 0; i < n_tokens; i++) {
const llama_seq_id seq_id = ubatch.seq_id[i][0]; const llama_seq_id seq_id = ubatch.seq_id[i][0];
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
continue; continue;
@@ -845,6 +838,7 @@ int llama_context::encode(llama_batch & inp_batch) {
auto & embd_seq_out = embd_seq; auto & embd_seq_out = embd_seq;
const uint32_t n_cls_out = hparams.n_cls_out; const uint32_t n_cls_out = hparams.n_cls_out;
// TODO: fix indexing [UBATCH_IDX]
for (uint32_t s = 0; s < ubatch.n_seqs; ++s) { for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
const llama_seq_id seq_id = ubatch.seq_id[s][0]; const llama_seq_id seq_id = ubatch.seq_id[s][0];
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) { if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
@@ -878,10 +872,10 @@ int llama_context::encode(llama_batch & inp_batch) {
// remember the sequence ids used during the encoding - needed for cross attention later // remember the sequence ids used during the encoding - needed for cross attention later
cross.seq_ids_enc.resize(n_tokens); cross.seq_ids_enc.resize(n_tokens);
for (int32_t i = 0; i < n_tokens; i++) { for (uint32_t i = 0; i < n_tokens; i++) {
cross.seq_ids_enc[i].clear(); cross.seq_ids_enc[i].clear();
for (int s = 0; s < ubatch.n_seq_id[i]; s++) { for (int s = 0; s < batch.n_seq_id[i]; s++) {
llama_seq_id seq_id = ubatch.seq_id[i][s]; llama_seq_id seq_id = batch.seq_id[i][s];
cross.seq_ids_enc[i].insert(seq_id); cross.seq_ids_enc[i].insert(seq_id);
} }
} }
@@ -890,51 +884,46 @@ int llama_context::encode(llama_batch & inp_batch) {
return 0; return 0;
} }
int llama_context::decode(llama_batch & inp_batch) { int llama_context::decode(const llama_batch & batch_inp) {
if (!memory) { if (!memory) {
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__); LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
return encode(inp_batch); return encode(batch_inp);
} }
if (inp_batch.n_tokens == 0) { if (batch_inp.n_tokens == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__); LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1; return -1;
} }
if (!inp_batch.pos) { // temporary allocate memory for the input batch if needed
if (inp_batch.seq_id) { if (!batch_allocr->init(batch_inp, model.vocab, batch_inp.pos ? -1 : memory->seq_pos_max(0) + 1)) {
LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__); LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
return -1; return -1;
}
} }
// temporary allocate memory for the input batch if needed const llama_batch & batch = batch_allocr->get_batch();
llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : memory->seq_pos_max(0) + 1);
const llama_batch & batch = batch_allocr.batch;
const auto & vocab = model.vocab; const auto & vocab = model.vocab;
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const int32_t n_vocab = vocab.n_tokens(); const int32_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
const int64_t n_tokens_all = batch.n_tokens; const uint32_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
// TODO: move the validation to the llama_batch_allocr // this indicates we are doing pooled embedding
if (batch.token) { const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
for (int64_t i = 0; i < n_tokens_all; ++i) {
if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
return -1;
}
if (batch.seq_id && (batch.seq_id[i][0] < 0 || batch.seq_id[i][0] >= LLAMA_MAX_PARALLEL_SEQUENCES)) { const uint32_t n_outputs_all = batch_allocr->get_n_outputs();
LLAMA_LOG_ERROR("%s: invalid seq_id[%" PRId64 "] = %d >= %d\n", __func__, i, batch.seq_id[i][0], LLAMA_MAX_PARALLEL_SEQUENCES);
return -1; if (embd_pooled) {
} // require that all tokens are output
if (n_outputs_all != n_tokens_all) {
LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
__func__, n_outputs_all, n_tokens_all);
return -1;
} }
} }
@@ -947,25 +936,9 @@ int llama_context::decode(llama_batch & inp_batch) {
} }
n_queued_tokens += n_tokens_all; n_queued_tokens += n_tokens_all;
// this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens // TODO: this clear of the buffer can easily be forgotten - need something better
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
embd_seq.clear(); embd_seq.clear();
int64_t n_outputs_all = 0;
// count outputs
if (batch.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
} else if (embd_pooled) {
n_outputs_all = n_tokens_all;
} else {
// keep last output only
n_outputs_all = 1;
}
bool did_optimize = false; bool did_optimize = false;
// handle any pending defrags/shifts // handle any pending defrags/shifts
@@ -974,7 +947,7 @@ int llama_context::decode(llama_batch & inp_batch) {
llama_memory_state_ptr mstate; llama_memory_state_ptr mstate;
while (true) { while (true) {
mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all); mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled);
if (!mstate) { if (!mstate) {
return -2; return -2;
} }
@@ -1018,7 +991,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// reserve output buffer // reserve output buffer
if (output_reserve(n_outputs_all) < n_outputs_all) { if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
return -2; return -2;
}; };
@@ -1027,7 +1000,7 @@ int llama_context::decode(llama_batch & inp_batch) {
do { do {
const auto & ubatch = mstate->get_ubatch(); const auto & ubatch = mstate->get_ubatch();
// count the outputs in this u_batch // count the outputs in this ubatch
{ {
int32_t n_outputs_new = 0; int32_t n_outputs_new = 0;
@@ -1057,6 +1030,7 @@ int llama_context::decode(llama_batch & inp_batch) {
pos_min[s] = std::numeric_limits<llama_pos>::max(); pos_min[s] = std::numeric_limits<llama_pos>::max();
} }
// TODO: fix sequence indexing
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
const auto & seq_id = ubatch.seq_id[i][0]; const auto & seq_id = ubatch.seq_id[i][0];
@@ -1170,14 +1144,14 @@ int llama_context::decode(llama_batch & inp_batch) {
n_outputs = n_outputs_all; n_outputs = n_outputs_all;
// set output mappings // set output mappings
{ if (n_outputs > 0) {
bool sorted_output = true; bool sorted_output = true;
auto & out_ids = mstate->out_ids(); auto & out_ids = mstate->out_ids();
GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all); GGML_ASSERT(out_ids.size() == (size_t) n_outputs);
for (int64_t i = 0; i < n_outputs_all; ++i) { for (int64_t i = 0; i < n_outputs; ++i) {
int64_t out_id = out_ids[i]; int64_t out_id = out_ids[i];
output_ids[out_id] = i; output_ids[out_id] = i;
if (out_id != i) { if (out_id != i) {
@@ -1189,20 +1163,22 @@ int llama_context::decode(llama_batch & inp_batch) {
// note: this is mostly relevant for recurrent models atm // note: this is mostly relevant for recurrent models atm
if (!sorted_output) { if (!sorted_output) {
const uint32_t n_vocab = model.vocab.n_tokens(); const uint32_t n_vocab = model.vocab.n_tokens();
const uint32_t n_embd = model.hparams.n_embd; const uint64_t n_embd = model.hparams.n_embd;
GGML_ASSERT((size_t) n_outputs == out_ids.size()); GGML_ASSERT((size_t) n_outputs == out_ids.size());
// TODO: is there something more efficient which also minimizes swaps? // TODO: is there something more efficient which also minimizes swaps?
// selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort) // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
for (int32_t i = 0; i < n_outputs - 1; ++i) { for (uint32_t i = 0; i < n_outputs - 1; ++i) {
int32_t j_min = i; uint32_t j_min = i;
for (int32_t j = i + 1; j < n_outputs; ++j) { for (uint32_t j = i + 1; j < n_outputs; ++j) {
if (out_ids[j] < out_ids[j_min]) { if (out_ids[j] < out_ids[j_min]) {
j_min = j; j_min = j;
} }
} }
if (j_min == i) { continue; } if (j_min == i) {
continue;
}
std::swap(out_ids[i], out_ids[j_min]); std::swap(out_ids[i], out_ids[j_min]);
if (logits_size > 0) { if (logits_size > 0) {
for (uint32_t k = 0; k < n_vocab; k++) { for (uint32_t k = 0; k < n_vocab; k++) {
@@ -1215,8 +1191,10 @@ int llama_context::decode(llama_batch & inp_batch) {
} }
} }
} }
std::fill(output_ids.begin(), output_ids.end(), -1); std::fill(output_ids.begin(), output_ids.end(), -1);
for (int32_t i = 0; i < n_outputs; ++i) {
for (uint32_t i = 0; i < n_outputs; ++i) {
output_ids[out_ids[i]] = i; output_ids[out_ids[i]] = i;
} }
} }
@@ -1236,7 +1214,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// output // output
// //
int32_t llama_context::output_reserve(int32_t n_outputs) { uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const auto & vocab = model.vocab; const auto & vocab = model.vocab;
@@ -1302,8 +1280,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
// set all ids as invalid (negative) // set all ids as invalid (negative)
std::fill(output_ids.begin(), output_ids.end(), -1); std::fill(output_ids.begin(), output_ids.end(), -1);
this->n_outputs = 0; this->n_outputs = 0;
this->n_outputs_max = n_outputs_max;
return n_outputs_max; return n_outputs_max;
} }
@@ -1332,7 +1309,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
if (n_tokens % n_seqs != 0) { if (n_tokens % n_seqs != 0) {
n_tokens = (n_tokens / n_seqs) * n_seqs; n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
n_outputs = std::min(n_outputs, n_tokens); n_outputs = std::min(n_outputs, n_tokens);
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs); LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
@@ -1794,14 +1771,12 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
std::vector<int32_t> w_output_pos; std::vector<int32_t> w_output_pos;
GGML_ASSERT(n_outputs <= n_outputs_max);
w_output_pos.resize(n_outputs); w_output_pos.resize(n_outputs);
// build a more compact representation of the output ids // build a more compact representation of the output ids
for (size_t i = 0; i < n_batch(); ++i) { for (size_t i = 0; i < n_batch(); ++i) {
// map an output id to a position in the batch // map an output id to a position in the batch
int32_t pos = output_ids[i]; int64_t pos = output_ids[i];
if (pos >= 0) { if (pos >= 0) {
GGML_ASSERT(pos < n_outputs); GGML_ASSERT(pos < n_outputs);
w_output_pos[pos] = i; w_output_pos[pos] = i;
@@ -2071,14 +2046,14 @@ void llama_context::opt_epoch_iter(
n_queued_tokens += n_tokens_all; n_queued_tokens += n_tokens_all;
// this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens // this indicates we are doing pooled embedding
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE; const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
embd_seq.clear(); embd_seq.clear();
int64_t n_outputs_all = n_tokens_all; uint32_t n_outputs_all = n_tokens_all;
auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ true); auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled);
if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) { if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__); LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
break; break;
@@ -2086,7 +2061,7 @@ void llama_context::opt_epoch_iter(
// reserve output buffer // reserve output buffer
if (output_reserve(n_outputs_all) < n_outputs_all) { if (output_reserve(n_outputs_all) < n_outputs_all) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all); LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);
GGML_ABORT("TODO: handle this error"); GGML_ABORT("TODO: handle this error");
}; };

View File

@@ -1,7 +1,6 @@
#pragma once #pragma once
#include "llama.h" #include "llama.h"
#include "llama-batch.h"
#include "llama-cparams.h" #include "llama-cparams.h"
#include "llama-graph.h" #include "llama-graph.h"
#include "llama-adapter.h" #include "llama-adapter.h"
@@ -13,6 +12,7 @@
#include <vector> #include <vector>
struct llama_model; struct llama_model;
class llama_batch_allocr;
class llama_io_read_i; class llama_io_read_i;
class llama_io_write_i; class llama_io_write_i;
@@ -102,8 +102,8 @@ struct llama_context {
llama_memory_state_i * mstate, llama_memory_state_i * mstate,
ggml_status & ret); ggml_status & ret);
int encode(llama_batch & inp_batch); int encode(const llama_batch & batch_inp);
int decode(llama_batch & inp_batch); int decode(const llama_batch & batch_inp);
// //
// state save/load // state save/load
@@ -181,7 +181,7 @@ private:
// Make sure enough space is available for outputs. // Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved. // Returns max number of outputs for which space was reserved.
int32_t output_reserve(int32_t n_outputs); uint32_t output_reserve(int32_t n_outputs);
// //
// graph // graph
@@ -246,8 +246,10 @@ private:
// populated only when pooling_type != LLAMA_POOLING_TYPE_NONE // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
std::map<llama_seq_id, std::vector<float>> embd_seq; std::map<llama_seq_id, std::vector<float>> embd_seq;
int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch // reuse the batch_allocr to avoid unnecessary memory allocations
int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers std::unique_ptr<llama_batch_allocr> batch_allocr;
uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

View File

@@ -139,6 +139,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
std::vector<uint64_t> sum(n_tokens, 0); std::vector<uint64_t> sum(n_tokens, 0);
// TODO: fix indexing [UBATCH_IDX]
for (int s = 0; s < n_seqs; ++s) { for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0]; const llama_seq_id seq_id = ubatch->seq_id[s][0];
@@ -156,6 +157,7 @@ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
} }
} }
// TODO: fix indexing [UBATCH_IDX]
for (int s = 0; s < n_seqs; ++s) { for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0]; const llama_seq_id seq_id = ubatch->seq_id[s][0];
@@ -180,6 +182,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
uint32_t * data = (uint32_t *) cls->data; uint32_t * data = (uint32_t *) cls->data;
memset(cls->data, 0, n_tokens * ggml_element_size(cls)); memset(cls->data, 0, n_tokens * ggml_element_size(cls));
// TODO: fix indexing [UBATCH_IDX]
for (int s = 0; s < n_seqs; ++s) { for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0]; const llama_seq_id seq_id = ubatch->seq_id[s][0];
@@ -210,6 +213,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
std::vector<int> last_pos(n_tokens, -1); std::vector<int> last_pos(n_tokens, -1);
std::vector<int> last_row(n_tokens, -1); std::vector<int> last_row(n_tokens, -1);
// TODO: fix indexing [UBATCH_IDX]
for (int s = 0; s < n_seqs; ++s) { for (int s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0]; const llama_seq_id seq_id = ubatch->seq_id[s][0];
@@ -283,6 +287,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const int32_t ti = s0*n_seq_tokens + i; const int32_t ti = s0*n_seq_tokens + i;
float f = -INFINITY; float f = -INFINITY;
// TODO: fix indexing [UBATCH_IDX]
for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) { if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
if (hparams.use_alibi) { if (hparams.use_alibi) {
@@ -322,6 +327,7 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
const int32_t ti = s0*n_seq_tokens + i; const int32_t ti = s0*n_seq_tokens + i;
float f = -INFINITY; float f = -INFINITY;
// TODO: fix indexing [UBATCH_IDX]
for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) { for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
if (ubatch->seq_id[s0][s] == seq_id) { if (ubatch->seq_id[s0][s] == seq_id) {
if (hparams.use_alibi) { if (hparams.use_alibi) {
@@ -377,6 +383,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
for (int j = 0; j < n_tokens; ++j) { for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_enc; ++i) { for (int i = 0; i < n_enc; ++i) {
float f = -INFINITY; float f = -INFINITY;
// TODO: fix indexing [UBATCH_IDX]
for (int s = 0; s < ubatch->n_seq_id[j]; ++s) { for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[j][s]; const llama_seq_id seq_id = ubatch->seq_id[j][s];
if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) { if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
@@ -1556,23 +1563,30 @@ void llm_graph_context::build_pooling(
ggml_tensor * inp_cls = build_inp_cls(); ggml_tensor * inp_cls = build_inp_cls();
inp = ggml_get_rows(ctx0, inp, inp_cls); inp = ggml_get_rows(ctx0, inp, inp_cls);
if (cls != nullptr && cls_b != nullptr) { if (cls) {
// classification head // classification head
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b); cur = ggml_mul_mat(ctx0, cls, inp);
if (cls_b) {
cur = ggml_add(ctx0, cur, cls_b);
}
cur = ggml_tanh(ctx0, cur); cur = ggml_tanh(ctx0, cur);
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
if (cls_out) { if (cls_out) {
GGML_ASSERT(cls_out_b != nullptr); cur = ggml_mul_mat(ctx0, cls_out, cur);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b); if (cls_out_b) {
cur = ggml_add(ctx0, cur, cls_out_b);
}
} }
} else if (cls_out) { } else if (cls_out) {
// Single layer classification head (direct projection) // Single layer classification head (direct projection)
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476 // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
GGML_ASSERT(cls_out_b != nullptr); cur = ggml_mul_mat(ctx0, cls_out, inp);
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b); if (cls_out_b) {
cur = ggml_add(ctx0, cur, cls_out_b);
}
} else { } else {
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b"); GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
} }

View File

@@ -378,7 +378,7 @@ struct llm_graph_params {
const llama_memory_state_i * mstate; const llama_memory_state_i * mstate;
const llama_cross * cross; const llama_cross * cross;
int32_t n_outputs; uint32_t n_outputs;
const llm_graph_cb & cb; const llm_graph_cb & cb;
}; };
@@ -412,8 +412,8 @@ struct llm_graph_context {
const float norm_eps; const float norm_eps;
const float norm_rms_eps; const float norm_rms_eps;
const int32_t n_tokens; const int64_t n_tokens;
const int32_t n_outputs; const int64_t n_outputs;
const int32_t n_ctx_orig; // yarn const int32_t n_ctx_orig; // yarn
const enum llama_pooling_type pooling_type; const enum llama_pooling_type pooling_type;

View File

@@ -359,10 +359,10 @@ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
return result; return result;
} }
llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { llama_memory_state_ptr llama_kv_cache_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled) {
GGML_UNUSED(embd_pooled); GGML_UNUSED(embd_pooled);
auto sbatch = llama_sbatch(batch, hparams.n_embd, false, logits_all); auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
std::vector<llama_ubatch> ubatches; std::vector<llama_ubatch> ubatches;

View File

@@ -32,8 +32,7 @@ public:
llama_memory_state_ptr init_batch( llama_memory_state_ptr init_batch(
const llama_batch & batch, const llama_batch & batch,
uint32_t n_ubatch, uint32_t n_ubatch,
bool embd_pooled, bool embd_pooled) override;
bool logits_all) override;
llama_memory_state_ptr init_full() override; llama_memory_state_ptr init_full() override;

View File

@@ -95,36 +95,69 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
return kv_swa->seq_pos_max(seq_id); return kv_swa->seq_pos_max(seq_id);
} }
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) { llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled) {
GGML_UNUSED(embd_pooled); GGML_UNUSED(embd_pooled);
// TODO: if we fail with split_simple, we should attempt different splitting strategies // first try simple split
do {
auto sbatch = llama_sbatch(batch, hparams.n_embd, true);
std::vector<llama_ubatch> ubatches;
while (sbatch.n_tokens > 0) {
auto ubatch = sbatch.split_simple(n_ubatch);
ubatches.push_back(ubatch);
}
auto heads_base = kv_base->prepare(ubatches);
if (heads_base.empty()) {
break;
}
auto heads_swa = kv_swa->prepare(ubatches);
if (heads_swa.empty()) {
break;
}
assert(heads_base.size() == heads_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_state>(
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
} while (false);
// if it fails, try equal split
do {
auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
std::vector<llama_ubatch> ubatches;
while (sbatch.n_tokens > 0) {
auto ubatch = sbatch.split_equal(n_ubatch);
ubatches.push_back(ubatch);
}
auto heads_base = kv_base->prepare(ubatches);
if (heads_base.empty()) {
break;
}
auto heads_swa = kv_swa->prepare(ubatches);
if (heads_swa.empty()) {
break;
}
assert(heads_base.size() == heads_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_state>(
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
} while (false);
// TODO: if we fail again, we should attempt different splitting strategies
// but to do that properly, we first have to refactor the batches to be more flexible // but to do that properly, we first have to refactor the batches to be more flexible
auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
std::vector<llama_ubatch> ubatches;
while (sbatch.n_tokens > 0) {
auto ubatch = sbatch.split_simple(n_ubatch);
ubatches.push_back(ubatch);
}
auto heads_base = kv_base->prepare(ubatches);
if (heads_base.empty()) {
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
auto heads_swa = kv_swa->prepare(ubatches);
if (heads_swa.empty()) {
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
assert(heads_base.size() == heads_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_state>(
this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
} }
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() { llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {

View File

@@ -34,8 +34,7 @@ public:
llama_memory_state_ptr init_batch( llama_memory_state_ptr init_batch(
const llama_batch & batch, const llama_batch & batch,
uint32_t n_ubatch, uint32_t n_ubatch,
bool embd_pooled, bool embd_pooled) override;
bool logits_all) override;
llama_memory_state_ptr init_full() override; llama_memory_state_ptr init_full() override;

View File

@@ -310,24 +310,27 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
llama_memory_state_ptr llama_kv_cache_unified::init_batch( llama_memory_state_ptr llama_kv_cache_unified::init_batch(
const llama_batch & batch, const llama_batch & batch,
uint32_t n_ubatch, uint32_t n_ubatch,
bool embd_pooled, bool embd_pooled) {
bool logits_all) {
GGML_UNUSED(embd_pooled); GGML_UNUSED(embd_pooled);
auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all); do {
auto sbatch = llama_sbatch(batch, hparams.n_embd, true);
std::vector<llama_ubatch> ubatches; std::vector<llama_ubatch> ubatches;
while (sbatch.n_tokens > 0) { while (sbatch.n_tokens > 0) {
ubatches.push_back(sbatch.split_simple(n_ubatch)); ubatches.push_back(sbatch.split_simple(n_ubatch));
} }
auto heads = prepare(ubatches); auto heads = prepare(ubatches);
if (heads.empty()) { if (heads.empty()) {
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE); break;
} }
return std::make_unique<llama_kv_cache_unified_state>( return std::make_unique<llama_kv_cache_unified_state>(
this, std::move(sbatch), std::move(heads), std::move(ubatches)); this, std::move(sbatch), std::move(heads), std::move(ubatches));
} while (false);
return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
} }
llama_memory_state_ptr llama_kv_cache_unified::init_full() { llama_memory_state_ptr llama_kv_cache_unified::init_full() {
@@ -521,7 +524,6 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
} }
if (debug > 0) { if (debug > 0) {
LLAMA_LOG_CONT("\n");
LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa); LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", __func__, cells.used_max_p1(), cells.get_used(), head, get_size(), n_swa);
if ((debug == 2 && n_swa > 0) || debug > 2) { if ((debug == 2 && n_swa > 0) || debug > 2) {
@@ -530,7 +532,13 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
if (cells.is_empty(i)) { if (cells.is_empty(i)) {
ss += '.'; ss += '.';
} else { } else {
ss += std::to_string(cells.seq_get(i)); assert(cells.seq_count(i) >= 1);
if (cells.seq_count(i) == 1) {
ss += std::to_string(cells.seq_get(i));
} else {
ss += 'M';
}
} }
if (i%256 == 255) { if (i%256 == 255) {
ss += " *"; ss += " *";
@@ -636,6 +644,12 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
} }
void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) { void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
if (debug > 0) {
LLAMA_LOG_DEBUG("%s: ubatch info:\n", __func__);
LLAMA_LOG_DEBUG("%s: n_tokens = %d, equal_seqs = %d\n", __func__, ubatch.n_tokens, ubatch.equal_seqs);
LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d, n_seqs = %d\n", __func__, ubatch.n_seq_tokens, ubatch.n_seqs);
}
// keep track of the max sequence position that we would overwrite with this ubatch // keep track of the max sequence position that we would overwrite with this ubatch
// for non-SWA cache, this would be always empty // for non-SWA cache, this would be always empty
llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES]; llama_seq_id seq_pos_max_rm[LLAMA_MAX_PARALLEL_SEQUENCES];
@@ -643,22 +657,27 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
seq_pos_max_rm[s] = -1; seq_pos_max_rm[s] = -1;
} }
for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
if (!cells.is_empty(head_cur + i)) { for (uint32_t j = 0; j < ubatch.n_seq_tokens; ++j) {
assert(cells.seq_count(head_cur + i) == 1); const uint32_t idx = s*ubatch.n_seq_tokens + j;
const llama_seq_id seq_id = cells.seq_get(head_cur + i); if (!cells.is_empty(head_cur + idx)) {
const llama_pos pos = cells.pos_get(head_cur + i); assert(cells.seq_count(head_cur + idx) == 1);
seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos); const llama_seq_id seq_id = cells.seq_get(head_cur + idx);
const llama_pos pos = cells.pos_get(head_cur + idx);
cells.rm(head_cur + i); seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
}
cells.pos_set(head_cur + i, ubatch.pos[i]); cells.rm(head_cur + idx);
}
for (int32_t j = 0; j < ubatch.n_seq_id[i]; j++) { cells.pos_set(head_cur + idx, ubatch.pos[idx]);
cells.seq_add(head_cur + i, ubatch.seq_id[i][j]);
// TODO: fix indexing [UBATCH_IDX]
for (int32_t i = 0; i < ubatch.n_seq_id[s]; i++) {
cells.seq_add(head_cur + idx, ubatch.seq_id[s][i]);
}
} }
} }
@@ -677,7 +696,6 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1); seq_rm(s, cells.seq_pos_min(s), seq_pos_max_rm[s] + 1);
} }
} }
// move the head at the end of the slot // move the head at the end of the slot
head = head_cur + ubatch.n_tokens; head = head_cur + ubatch.n_tokens;
} }
@@ -774,14 +792,14 @@ ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_
} }
void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const { void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
const int64_t n_tokens = ubatch->n_tokens; const uint32_t n_tokens = ubatch->n_tokens;
const int64_t n_seq_tokens = ubatch->n_seq_tokens; const uint32_t n_seq_tokens = ubatch->n_seq_tokens;
const int64_t n_seqs = ubatch->n_seqs; const uint32_t n_seqs = ubatch->n_seqs;
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer)); GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
float * data = (float *) dst->data; float * data = (float *) dst->data;
const auto n_kv = dst->ne[0]; const int64_t n_kv = dst->ne[0];
// Use only the previous KV cells of the correct sequence for each token of the ubatch. // Use only the previous KV cells of the correct sequence for each token of the ubatch.
// It's assumed that if a token in the batch has multiple sequences, they are equivalent. // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
@@ -795,12 +813,14 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
// xxxxx----- // xxxxx-----
// xxxxx----- // xxxxx-----
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615 // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
for (int h = 0; h < 1; ++h) { for (uint32_t h = 0; h < 1; ++h) {
for (int s = 0; s < n_seqs; ++s) { for (uint32_t s = 0; s < n_seqs; ++s) {
const llama_seq_id seq_id = ubatch->seq_id[s][0]; const llama_seq_id seq_id = ubatch->seq_id[s][0];
for (int j = 0; j < n_seq_tokens; ++j) { for (uint32_t j = 0; j < n_seq_tokens; ++j) {
const llama_pos p1 = ubatch->pos[s*n_seq_tokens + j]; const uint32_t idx = s*n_seq_tokens + j;
const llama_pos p1 = ubatch->pos[idx];
for (uint32_t i = 0; i < n_kv; ++i) { for (uint32_t i = 0; i < n_kv; ++i) {
float f = 0.0f; float f = 0.0f;
@@ -830,16 +850,16 @@ void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ub
f = -INFINITY; f = -INFINITY;
} }
data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f; data[h*(n_kv*n_tokens) + idx*n_kv + i] = f;
} }
} }
} }
// mask padded tokens // mask padded tokens
if (data) { if (data) {
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) { for (uint32_t j = n_tokens; j < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++j) {
for (uint32_t j = 0; j < n_kv; ++j) { for (uint32_t i = 0; i < n_kv; ++i) {
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY; data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
} }
} }
} }
@@ -1490,9 +1510,11 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
seq_rm(dest_seq_id, -1, -1); seq_rm(dest_seq_id, -1, -1);
llama_sbatch sbatch; llama_sbatch sbatch;
llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false); llama_ubatch ubatch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
batch.n_tokens = cell_count; ubatch.n_tokens = cell_count;
ubatch.n_seq_tokens = cell_count;
ubatch.n_seqs = 1;
for (uint32_t i = 0; i < cell_count; ++i) { for (uint32_t i = 0; i < cell_count; ++i) {
llama_pos pos; llama_pos pos;
@@ -1512,18 +1534,18 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
io.read_to(&seq_id, sizeof(seq_id)); io.read_to(&seq_id, sizeof(seq_id));
} }
batch.pos[i] = pos; ubatch.pos[i] = pos;
batch.n_seq_id[i] = n_seq_id; ubatch.n_seq_id[i] = n_seq_id;
batch.seq_id[i] = &dest_seq_id; ubatch.seq_id[i] = &dest_seq_id;
} }
const auto head_cur = find_slot(batch); const auto head_cur = find_slot(ubatch);
if (head_cur < 0) { if (head_cur < 0) {
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
return false; return false;
} }
apply_ubatch(head_cur, batch); apply_ubatch(head_cur, ubatch);
// keep the head at the old position because we will read the KV data into it in state_read_data() // keep the head at the old position because we will read the KV data into it in state_read_data()
head = head_cur; head = head_cur;
@@ -1531,8 +1553,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
// DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values) // DEBUG CHECK: head_cur should be our first cell, head_cur + cell_count - 1 should be our last cell (verify seq_id and pos values)
// Assume that this is one contiguous block of cells // Assume that this is one contiguous block of cells
GGML_ASSERT(head_cur + cell_count <= cells.size()); GGML_ASSERT(head_cur + cell_count <= cells.size());
GGML_ASSERT(cells.pos_get(head_cur) == batch.pos[0]); GGML_ASSERT(cells.pos_get(head_cur) == ubatch.pos[0]);
GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == batch.pos[cell_count - 1]); GGML_ASSERT(cells.pos_get(head_cur + cell_count - 1) == ubatch.pos[cell_count - 1]);
GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id)); GGML_ASSERT(cells.seq_has(head_cur, dest_seq_id));
GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id)); GGML_ASSERT(cells.seq_has(head_cur + cell_count - 1, dest_seq_id));
} else { } else {

View File

@@ -59,8 +59,7 @@ public:
llama_memory_state_ptr init_batch( llama_memory_state_ptr init_batch(
const llama_batch & batch, const llama_batch & batch,
uint32_t n_ubatch, uint32_t n_ubatch,
bool embd_pooled, bool embd_pooled) override;
bool logits_all) override;
llama_memory_state_ptr init_full() override; llama_memory_state_ptr init_full() override;

View File

@@ -73,8 +73,7 @@ struct llama_memory_i {
virtual llama_memory_state_ptr init_batch( virtual llama_memory_state_ptr init_batch(
const llama_batch & batch, const llama_batch & batch,
uint32_t n_ubatch, uint32_t n_ubatch,
bool embd_pooled, bool embd_pooled) = 0;
bool logits_all) = 0;
// simulate full cache, used for allocating worst-case compute buffers // simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_state_ptr init_full() = 0; virtual llama_memory_state_ptr init_full() = 0;

View File

@@ -9,16 +9,16 @@
#include <algorithm> #include <algorithm>
#include <cassert> #include <cassert>
#include <cctype>
#include <cfloat> #include <cfloat>
#include <climits>
#include <cstdarg> #include <cstdarg>
#include <cstring> #include <cstring>
#include <forward_list> #include <forward_list>
#include <limits>
#include <map> #include <map>
#include <queue> #include <queue>
#include <set> #include <set>
#include <unordered_map> #include <unordered_map>
#include <cctype>
// //
// helpers // helpers
@@ -2572,6 +2572,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
// copy piece chars to output text buffer // copy piece chars to output text buffer
// skip up to 'lstrip' leading spaces before copying // skip up to 'lstrip' leading spaces before copying
auto _try_copy = [=] (const char * token, size_t size) -> int32_t { auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
}
for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
token++; token++;
size--; size--;
@@ -2768,26 +2772,26 @@ void llama_vocab::impl::print_info() const {
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
// special tokens // special tokens
if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); } if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); } if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); } if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); } if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); } if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); } if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); } if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); } if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); } if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); } if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); } if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); } if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); } if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); } if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); } if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
for (const auto & id : special_eog_ids) { for (const auto & id : special_eog_ids) {
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() ); LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
} }
LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len); LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);

View File

@@ -2017,11 +2017,6 @@ struct server_context {
params_base.n_cache_reuse = 0; params_base.n_cache_reuse = 0;
SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled"); SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
} }
if (!params_base.speculative.model.path.empty()) {
SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
return false;
}
} }
return true; return true;
@@ -3222,7 +3217,7 @@ struct server_context {
} }
const auto n_swa = llama_model_n_swa(model); const auto n_swa = llama_model_n_swa(model);
if (pos_min > slot.n_past - n_swa) { if (pos_min > std::max(0, slot.n_past - n_swa)) {
SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa); SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n", SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
"https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055"); "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");