ggml : parallelize FP32 conversion when using BLAS (llama/5045)
* make the GGML_TASK_INIT phase able to run multithreaded
* multithreaded dequantization in mul_mat when using a BLAS library
* minor fixes
* update outdated comment
* fix coding style
* simplify code
Co-authored-by: Georgi Gerganov <[email protected]>
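The core of the change: the FP32 conversion that used to run on a single thread inside the BLAS branch of mul_mat now runs in the GGML_TASK_INIT phase, striped across threads by row. A minimal standalone sketch of that striping pattern (the types and the `convert_rows` helper are illustrative stand-ins, not the ggml API):

```c
// Row-striping sketch: thread ith of nth converts rows ith, ith+nth, ith+2*nth, ...
// Each thread writes a disjoint set of rows, so no locking is needed.
#include <stddef.h>
#include <stdint.h>

typedef void (*to_float_t)(const void * src, float * dst, int64_t n);

static void convert_rows(to_float_t to_float,
                         const void * src, size_t src_row_stride,
                         float * dst, int64_t nrows, int64_t ncols,
                         int ith, int nth) {
    for (int64_t i01 = ith; i01 < nrows; i01 += nth) {
        to_float((const char *) src + i01*src_row_stride, dst + i01*ncols, ncols);
    }
}
```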
ggml.c
CHANGED
@@ -7815,6 +7815,9 @@ static void ggml_compute_forward_acc_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
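Because GGML_TASK_INIT is now entered by every worker thread, each op whose INIT work is inherently single-writer (a memcpy or memset) gains the guard shown above, so only the first thread performs it. A minimal sketch of the pattern, with simplified stand-in types rather than the ggml structs:

```c
#include <string.h>

struct compute_params { int ith; int nth; }; // stand-in for ggml_compute_params

static void serial_init(const struct compute_params * params,
                        void * dst, const void * src, size_t nbytes) {
    if (params->ith != 0) {
        return; // every thread except the first skips the serial INIT work
    }
    memcpy(dst, src, nbytes); // runs exactly once, before COMPUTE begins
}
```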
@@ -9957,11 +9960,30 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void  *       x     = (char *)  src0->data    + i02*nb02 + i03*nb03;
+                        float       * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                        ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
             return;
         }
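The INIT phase above writes each dequantized (i12, i13) plane to a fixed offset in the work buffer, and the COMPUTE phase below reads it back from the same offset, so no handshake between the phases is needed beyond the phase barrier itself. A small sketch of the offset arithmetic, assuming the plane layout from the hunk:

```c
#include <stdint.h>

// Plane (i12, i13) starts at float index (i13*ne12 + i12) * ne_plane in wdata,
// where ne_plane = ne01*ne00; INIT writers and the sgemm reader both use this.
static inline int64_t plane_offset(int64_t i12, int64_t i13,
                                   int64_t ne12, int64_t ne_plane) {
    return i13*ne12*ne_plane + i12*ne_plane;
}
```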
@@ -9969,9 +9991,14 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
@@ -9980,17 +10007,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata    = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -10000,6 +10017,7 @@ static void ggml_compute_forward_mul_mat(
                     0.0f, d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
@@ -10008,6 +10026,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10172,6 +10193,9 @@ static void ggml_compute_forward_mul_mat_id(
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         char * wdata = params->wdata;
         if (src1->type != vec_dot_type) {
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10357,6 +10381,9 @@ static void ggml_compute_forward_out_prod_f32(
             return;
         }
 #endif
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10540,6 +10567,9 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10724,6 +10754,9 @@ static void ggml_compute_forward_set_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -11048,6 +11081,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11082,6 +11118,9 @@ static void ggml_compute_forward_get_rows_back_f32(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11219,6 +11258,9 @@ static void ggml_compute_forward_diag_mask_f32(
     GGML_ASSERT(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12189,6 +12231,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12283,6 +12328,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12507,6 +12555,9 @@ static void ggml_compute_forward_conv_transpose_2d(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -14121,6 +14172,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
 
     const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
     if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         return;
     }
@@ -16414,8 +16468,9 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
 
     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -16663,6 +16718,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
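Both helpers are spin-waits on a shared atomic, with an optional sched_yield for workloads where busy-spinning hurts. A self-contained sketch of the same mechanism (the `g_phase` global and function names here are illustrative, not the ggml API):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

static atomic_int g_phase; // stands in for state->shared->node_task

// one thread publishes the next phase ...
static void publish_phase(int phase) {
    atomic_store(&g_phase, phase);
}

// ... and the rest spin until they observe the change
static void wait_phase_change(int * phase, bool do_yield) {
    const int last = *phase;
    while (true) {
        if (do_yield) {
            sched_yield(); // give the publishing thread a chance to run
        }
        *phase = atomic_load(&g_phase);
        if (*phase != last) break;
    }
}
```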
@@ -16673,7 +16756,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     set_numa_thread_affinity(state->ith, n_threads);
 
-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;
 
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16713,13 +16797,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
             params.nth = n_tasks;
 
-            /* INIT */
-            if (GGML_OP_HAS_INIT[node->op]) {
-                params.type = GGML_TASK_INIT;
-                ggml_compute_forward(&params, node);
-            }
-
             if (n_tasks == 1) {
+                /* INIT */
+                if (GGML_OP_HAS_INIT[node->op]) {
+                    params.type = GGML_TASK_INIT;
+                    ggml_compute_forward(&params, node);
+                }
+
                 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                 // they do something more efficient than spinning (?)
                 params.type = GGML_TASK_COMPUTE;
@@ -16740,38 +16824,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 }
             }
 
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
         } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                // depending on the workload and the operating system.
-                // since it is not clear what is the best approach, it should potentially become user-configurable
-                // ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD: adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
         }
 
         // check if we should stop
         if (node_n >= cgraph->n_nodes) break;
 
-        /* COMPUTE */
+        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
             /*.ith   =*/ state->ith,
             /*.nth   =*/ n_tasks,
             /*.wsize =*/ cplan->work_size,
@@ -16779,8 +16849,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         };
 
         if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            // depending on the workload and the operating system.
+            // since it is not clear what is the best approach, it should potentially become user-configurable
+            // ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD: adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
+        if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);
         }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
     }
 
     return GGML_EXIT_SUCCESS;
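The two `atomic_fetch_sub` blocks above form a counting barrier between INIT, COMPUTE, and FINALIZE: the last thread to arrive rearms the counter and publishes the next phase, while everyone else waits. A minimal sketch of one such transition, with stand-in types rather than the ggml structs:

```c
#include <stdatomic.h>

struct shared_state {
    atomic_int n_active;  // threads still in the current phase
    atomic_int node_task; // currently published phase
};

static void advance_phase(struct shared_state * sh, int * task_phase,
                          int next_phase, int n_threads) {
    if (atomic_fetch_sub(&sh->n_active, 1) == 1) {
        // last thread out of the phase: rearm the counter, publish the next phase
        *task_phase = next_phase;
        atomic_store(&sh->n_active,  n_threads);
        atomic_store(&sh->node_task, *task_phase);
    } else {
        // everyone else spins until the published phase changes
        const int last = *task_phase;
        do {
            *task_phase = atomic_load(&sh->node_task);
        } while (*task_phase == last);
    }
}
```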
@@ -16837,8 +16938,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                 if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                        // here we need memory for fully dequantized matrix from src0
+                        cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
                     }
                 } else
 #endif
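The work-buffer plan grows accordingly: instead of one F32 plane reused per iteration, it must hold the fully dequantized src0 so the INIT threads can fill all of it concurrently. A rough size comparison with illustrative dimensions (not taken from the commit):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne00 = 4096, ne01 = 4096; // one src0 plane (illustrative)
    const int64_t ne02 = 4,    ne03 = 2;    // remaining src0 dims (illustrative)
    const int64_t old_wsize = sizeof(float)*ne00*ne01;            // single 2D plane
    const int64_t new_wsize = sizeof(float)*ne00*ne01*ne02*ne03;  // ggml_nelements(src0) floats
    printf("old: %lld MiB, new: %lld MiB\n",
           (long long)(old_wsize >> 20), (long long)(new_wsize >> 20));
    return 0; // prints "old: 64 MiB, new: 512 MiB" for these dims
}
```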
@@ -16992,6 +17093,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.n_threads =*/ n_threads,
         /*.n_active  =*/ n_threads,
         /*.node_n    =*/ -1,
+        /*.node_task =*/ GGML_TASK_FINALIZE,
         /*.abort_callback =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };