reinforce20001, ggerganov committed
Commit 7bf2c87 · unverified · 1 Parent(s): dc8f956

ggml : parallelize FP32 conversion when using BLAS (llama/5045)


* allow the GGML_TASK_INIT phase to run multithreaded

* multithreaded dequantization in mul_mat when using a BLAS library

* minor fixes

* update outdated comment
* fix coding style

* simplify code

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: Georgi Gerganov <[email protected]>
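
For context: in the BLAS path, dequantization of src0 to FP32 now happens in the GGML_TASK_INIT phase and is split across threads by striding over rows, so thread ith converts rows ith, ith + nth, ith + 2*nth, ... into its own disjoint slice of the shared wdata buffer, and no locking is needed beyond the phase barrier. A minimal standalone sketch of that row partitioning follows; the pthread harness, the worker_arg struct, and the row_to_float converter are illustrative stand-ins, not the ggml API.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// illustrative stand-in for a ggml "to_float" row converter
static void row_to_float(const int8_t * src, float * dst, int64_t n) {
    for (int64_t i = 0; i < n; i++) {
        dst[i] = (float) src[i];
    }
}

struct worker_arg {
    const int8_t * x;     // quantized source rows (ne01 x ne00)
    float        * wdata; // shared FP32 destination buffer
    int64_t        ne00;  // row length
    int64_t        ne01;  // number of rows
    int            ith;   // this thread's index
    int            nth;   // total number of threads
};

// each thread converts rows ith, ith+nth, ith+2*nth, ... -> disjoint output slices
static void * convert_rows(void * data) {
    struct worker_arg * a = (struct worker_arg *) data;
    for (int64_t i01 = a->ith; i01 < a->ne01; i01 += a->nth) {
        row_to_float(a->x + i01*a->ne00, a->wdata + i01*a->ne00, a->ne00);
    }
    return NULL;
}

int main(void) {
    enum { NE00 = 8, NE01 = 6, NTH = 3 };

    int8_t src[NE01*NE00];
    for (int i = 0; i < NE01*NE00; i++) src[i] = (int8_t) i;

    float * wdata = malloc(sizeof(float)*NE01*NE00);

    pthread_t         threads[NTH];
    struct worker_arg args[NTH];
    for (int ith = 0; ith < NTH; ith++) {
        args[ith] = (struct worker_arg) {
            .x = src, .wdata = wdata, .ne00 = NE00, .ne01 = NE01, .ith = ith, .nth = NTH,
        };
        pthread_create(&threads[ith], NULL, convert_rows, &args[ith]);
    }
    for (int ith = 0; ith < NTH; ith++) {
        pthread_join(threads[ith], NULL);
    }

    printf("wdata[0] = %.1f, wdata[%d] = %.1f\n", wdata[0], NE01*NE00 - 1, wdata[NE01*NE00 - 1]);
    free(wdata);
    return 0;
}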

Files changed (1)
  1. ggml.c +150 -48
ggml.c CHANGED
@@ -7815,6 +7815,9 @@ static void ggml_compute_forward_acc_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -9957,11 +9960,30 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void            *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03;
+                              float           * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                              ggml_to_float_t   const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
             return;
         }
 
@@ -9969,9 +9991,14 @@ static void ggml_compute_forward_mul_mat(
             return;
         }
 
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
@@ -9980,17 +10007,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -10000,6 +10017,7 @@ static void ggml_compute_forward_mul_mat(
                         0.0f, d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
@@ -10008,6 +10026,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10172,6 +10193,9 @@ static void ggml_compute_forward_mul_mat_id(
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         char * wdata = params->wdata;
         if (src1->type != vec_dot_type) {
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10357,6 +10381,9 @@ static void ggml_compute_forward_out_prod_f32(
             return;
         }
 #endif
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10540,6 +10567,9 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10724,6 +10754,9 @@ static void ggml_compute_forward_set_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -11048,6 +11081,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11082,6 +11118,9 @@ static void ggml_compute_forward_get_rows_back_f32(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11219,6 +11258,9 @@ static void ggml_compute_forward_diag_mask_f32(
     GGML_ASSERT(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12189,6 +12231,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12283,6 +12328,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12507,6 +12555,9 @@ static void ggml_compute_forward_conv_transpose_2d(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -14121,6 +14172,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
 
     const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
     if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         return;
     }
@@ -16414,8 +16468,9 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
 
     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -16663,6 +16718,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
@@ -16673,7 +16756,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     set_numa_thread_affinity(state->ith, n_threads);
 
-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;
 
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16713,13 +16797,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
                 params.nth = n_tasks;
 
-                /* INIT */
-                if (GGML_OP_HAS_INIT[node->op]) {
-                    params.type = GGML_TASK_INIT;
-                    ggml_compute_forward(&params, node);
-                }
-
                 if (n_tasks == 1) {
+                    /* INIT */
+                    if (GGML_OP_HAS_INIT[node->op]) {
+                        params.type = GGML_TASK_INIT;
+                        ggml_compute_forward(&params, node);
+                    }
+
                     // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                     //       they do something more efficient than spinning (?)
                     params.type = GGML_TASK_COMPUTE;
@@ -16740,38 +16824,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 }
             }
 
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
         } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
         }
 
         // check if we should stop
         if (node_n >= cgraph->n_nodes) break;
 
-        /* COMPUTE */
+        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
             /*.ith   =*/ state->ith,
             /*.nth   =*/ n_tasks,
             /*.wsize =*/ cplan->work_size,
@@ -16779,8 +16849,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         };
 
         if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            //       depending on the workload and the operating system.
+            //       since it is not clear what is the best approach, it should potentially become user-configurable
+            //       ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD:  adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
+        if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);
         }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
     }
 
     return GGML_EXIT_SUCCESS;
@@ -16837,8 +16938,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                 if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                        // here we need memory for fully dequantized matrix from src0
+                        cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
                     }
                 } else
 #endif
@@ -16992,6 +17093,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.n_threads =*/ n_threads,
         /*.n_active  =*/ n_threads,
         /*.node_n    =*/ -1,
+        /*.node_task =*/ GGML_TASK_FINALIZE,
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };
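
The scheduler side of the change replaces the old "run INIT on the coordinating thread, then spin on node_n" logic with a per-node task phase: every worker runs its share of GGML_TASK_INIT, the last worker to decrement n_active flips node_task to GGML_TASK_COMPUTE and resets the counter, and only then does everyone move on to the compute phase. A reduced sketch of that last-thread-advances-the-phase barrier using C11 atomics follows; phase_barrier and its fields are illustrative names, not part of ggml.

#include <stdatomic.h>
#include <stdio.h>

enum task_phase { TASK_INIT, TASK_COMPUTE, TASK_FINALIZE };

struct phase_barrier {
    atomic_int n_active;  // threads still working in the current phase
    atomic_int node_task; // phase currently being executed
    int        n_threads;
};

// called by every worker when it finishes its share of the current phase;
// the last one to arrive advances the phase, the others spin until it changes
static void phase_barrier_advance(struct phase_barrier * b, int next_phase) {
    if (atomic_fetch_sub(&b->n_active, 1) == 1) {
        atomic_store(&b->n_active, b->n_threads);
        atomic_store(&b->node_task, next_phase);
    } else {
        while (atomic_load(&b->node_task) != next_phase) {
            // spin (ggml optionally calls sched_yield() here)
        }
    }
}

int main(void) {
    struct phase_barrier b;
    b.n_threads = 1;
    atomic_init(&b.n_active,  b.n_threads);
    atomic_init(&b.node_task, TASK_INIT);

    // with a single thread the caller is always "last", so the phase flips immediately
    phase_barrier_advance(&b, TASK_COMPUTE);
    printf("phase is now %d\n", atomic_load(&b.node_task));
    return 0;
}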