slaren committed
Commit 97ce95c · unverified · Parent(s): 0935414

llama : run all KQV ops on the CPU with no KV offload (llama/5049)
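Context for the change: when KV offloading is disabled, the KV cache tensors are allocated in host memory, and with this patch the scheduler keeps the attention (KQV) ops on the CPU backend rather than copying the KV tensors to the GPU for every evaluation. A minimal sketch of opting into that path from the llama.cpp API, assuming the public offload_kqv context parameter (the programmatic equivalent of the --no-kv-offload CLI flag); make_cpu_kv_context is an illustrative helper, not part of the API:

#include "llama.h"

// Sketch: keep the KV cache, and hence the KQV attention ops, on the CPU.
// `model` is assumed to be loaded elsewhere (e.g. llama_load_model_from_file).
struct llama_context * make_cpu_kv_context(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.offload_kqv = false; // no KV offload -> KQV ops stay on the CPU backend
    return llama_new_context_with_model(model, cparams);
}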

Files changed (1):
  ggml-backend.c  (+20 -14)
ggml-backend.c CHANGED

@@ -1191,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             ggml_tallocr_t src_allocr = node_allocr(src);
             GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
             if (src_allocr != node_allocr) {
+                // create a copy of the input in the split's backend
+                size_t id = hash_id(src);
+                if (sched->node_copies[id][cur_backend_id] == NULL) {
+                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                    sched->node_copies[id][cur_backend_id] = tensor_copy;
+                    node_allocr(tensor_copy) = cur_allocr;
+                    SET_CAUSE(tensor_copy, "4.cpy");
+
+                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                    GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                    sched->splits[cur_split].inputs[n_inputs] = src;
+                }
+                node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                 // check if the input is already in the split
                 bool found = false;
                 for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1206,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-
-                // create a copy of the input in the split's backend
-                size_t id = hash_id(src);
-                if (sched->node_copies[id][cur_backend_id] == NULL) {
-                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                    sched->node_copies[id][cur_backend_id] = tensor_copy;
-                    node_allocr(tensor_copy) = cur_allocr;
-                    SET_CAUSE(tensor_copy, "4.cpy");
-                }
-                node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
             }
         }
     }
@@ -1333,7 +1339,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
             ggml_backend_graph_compute(split_backend, &split->graph);
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
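Net effect of the scheduler change: an input that lives on a different backend than its node now always gets a backend-local copy, deduplicated through the node_copies hash table so each (tensor, backend) pair is copied at most once per split, while the old "is this input already in the split?" scan is kept under #if 0. A minimal, self-contained sketch of that dedup-on-first-use pattern; tensor_t, get_input_copy, and the fixed-size arrays are hypothetical stand-ins for the ggml scheduler state, not the real structures:

#include <stdio.h>

#define MAX_TENSORS  16
#define MAX_BACKENDS  4
#define MAX_INPUTS    8

/* Hypothetical, simplified stand-in for a ggml tensor. */
typedef struct { char name[32]; int backend_id; } tensor_t;

static tensor_t *node_copies[MAX_TENSORS][MAX_BACKENDS]; /* dedup cache, like sched->node_copies */
static tensor_t *split_inputs[MAX_INPUTS];               /* like sched->splits[cur_split].inputs */
static int       n_split_inputs = 0;
static tensor_t  storage[MAX_TENSORS];                   /* backing store for the copies */
static int       n_storage = 0;

/* Return the copy of `src` living on `backend_id`, creating it (and
 * registering `src` as a split input) only on first use -- the same
 * flow as the new code path in sched_split_graph above. */
static tensor_t *get_input_copy(tensor_t *src, int src_id, int backend_id) {
    if (node_copies[src_id][backend_id] == NULL) {
        tensor_t *copy = &storage[n_storage++];
        snprintf(copy->name, sizeof(copy->name), "backend%d#%s", backend_id, src->name);
        copy->backend_id = backend_id;
        node_copies[src_id][backend_id] = copy;
        split_inputs[n_split_inputs++] = src; /* each input is recorded once per split */
    }
    return node_copies[src_id][backend_id];
}

int main(void) {
    tensor_t kv = { "kv_cache", 0 }; /* lives on backend 0 (the CPU) */
    /* Two uses by ops scheduled on backend 1 -> one copy, one split input. */
    tensor_t *a = get_input_copy(&kv, 0, 1);
    tensor_t *b = get_input_copy(&kv, 0, 1);
    printf("%s, same copy: %s, split inputs: %d\n",
           a->name, a == b ? "yes" : "no", n_split_inputs);
    return 0;
}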