mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	imatrix : fix 3d activation handling for hybrid and recurrent models (#14994)
* imatrix : use a single count for dense 3d tensors
* imatrix : fix 3d activations when model tensor is 2d
* imatrix : fix 3d tensor counts
This commit is contained in:
		| @@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|     const char * data = is_host ? (const char *) src1->data : m_src1_data.data(); | ||||
|     GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); | ||||
|  | ||||
|     // TODO: 4d? (is that even used in practice?) | ||||
|     // the extra dimension would need to be stored somewhere to be reflected in the imatrix file | ||||
|     if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { | ||||
|         LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); | ||||
|         GGML_ASSERT(false); | ||||
|     } | ||||
|  | ||||
|     // this has been adapted to the new format of storing merged experts in a single 3d tensor | ||||
|     // ref: https://github.com/ggml-org/llama.cpp/pull/6387 | ||||
|     if (t->op == GGML_OP_MUL_MAT_ID) { | ||||
| @@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|  | ||||
|         GGML_ASSERT(ids->ne[1] == src1->ne[2]); | ||||
|  | ||||
|         // the extra dimension would need to be stored somewhere to be reflected in the imatrix file | ||||
|         if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) { | ||||
|             LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str()); | ||||
|             GGML_ASSERT(false); | ||||
|         } | ||||
|  | ||||
|         m_ids.resize(ggml_nbytes(ids)); | ||||
|         ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); | ||||
|  | ||||
| @@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|         } | ||||
|     } else { | ||||
|         auto & e = m_stats[wname]; | ||||
|         const int64_t n_mat = src1->ne[2] * src1->ne[3]; | ||||
|         const int64_t n_mat = src0->ne[2] * src0->ne[3]; | ||||
|  | ||||
|         // use a single count per dense tensor | ||||
|         // (necessary when merging older GGUF-imatrix files with 3d tensors) | ||||
|         if (e.counts.size() > 1) { | ||||
|             bool all_equal = true; | ||||
|             for (size_t i = 1; i < e.counts.size(); ++i) { | ||||
|                 if (e.counts[0] != e.counts[i]) { | ||||
|                     all_equal = false; | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|             if (all_equal) { | ||||
|                 e.counts.resize(1); | ||||
|             } | ||||
|         } | ||||
|         if (e.values.empty()) { | ||||
|             e.values.resize(src1->ne[0] * n_mat, 0); | ||||
|             e.counts.resize(n_mat, 0); | ||||
|             e.counts.resize(1, 0); | ||||
|         } | ||||
|         else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) { | ||||
|             LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat)); | ||||
|             exit(1); //GGML_ABORT("fatal error"); | ||||
|         } | ||||
|         else if (e.counts.size() != (size_t)n_mat) { | ||||
|             LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat); | ||||
|             exit(1); //GGML_ABORT("fatal error"); | ||||
|         } | ||||
|         LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type); | ||||
|  | ||||
|         for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) { | ||||
|             for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) { | ||||
|                 const int64_t mat_id = i3 * src1->ne[2] + i2; | ||||
|                 // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D | ||||
|                 const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]); | ||||
|                 const int64_t mat_start = mat_id * src1->ne[0]; | ||||
|  | ||||
|                 for (int64_t row = 0; row < src1->ne[1]; ++row) { | ||||
|                     const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]); | ||||
|                     e.counts[mat_id]++; | ||||
|                     const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]); | ||||
|                     for (int64_t j = 0; j < src1->ne[0]; ++j) { | ||||
|                         e.values[mat_start + j] += x[j] * x[j]; | ||||
|                         if (!std::isfinite((float)e.values[j])) { | ||||
| @@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 const int32_t n_chunk = e.counts[mat_id] / chunk_size; | ||||
|                 if (n_chunk > m_last_chunk) { | ||||
|                     const int32_t chunk_step = n_chunk - m_last_chunk; | ||||
|                     m_last_chunk = n_chunk; | ||||
|                     if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { | ||||
|                         save_imatrix(); | ||||
|                     } | ||||
|                     if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { | ||||
|                         save_imatrix(m_last_chunk); | ||||
|                     } | ||||
|             } | ||||
|         } | ||||
|         // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT | ||||
|         for (size_t i = 0; i < e.counts.size(); ++i) { | ||||
|             e.counts[i] += ggml_nrows(src1) / n_mat; | ||||
|             const int32_t n_chunk = e.counts[i] / chunk_size; | ||||
|             if (n_chunk > m_last_chunk) { | ||||
|                 const int32_t chunk_step = n_chunk - m_last_chunk; | ||||
|                 m_last_chunk = n_chunk; | ||||
|                 if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) { | ||||
|                     save_imatrix(); | ||||
|                 } | ||||
|                 if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) { | ||||
|                     save_imatrix(m_last_chunk); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 compilade
					compilade