slaren committed:
llama : run all KQV ops on the CPU with no KV offload (llama/5049)
ggml-backend.c  +20 -14
@@ -1191,6 +1191,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             ggml_tallocr_t src_allocr = node_allocr(src);
             GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
             if (src_allocr != node_allocr) {
+                // create a copy of the input in the split's backend
+                size_t id = hash_id(src);
+                if (sched->node_copies[id][cur_backend_id] == NULL) {
+                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                    sched->node_copies[id][cur_backend_id] = tensor_copy;
+                    node_allocr(tensor_copy) = cur_allocr;
+                    SET_CAUSE(tensor_copy, "4.cpy");
+
+                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                    GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                    sched->splits[cur_split].inputs[n_inputs] = src;
+                }
+                node->src[j] = sched->node_copies[id][cur_backend_id];
+
+#if 0
                 // check if the input is already in the split
                 bool found = false;
                 for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1206,19 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                     sched->splits[cur_split].inputs[n_inputs] = src;
                 }
-
-                // create a copy of the input in the split's backend
-                size_t id = hash_id(src);
-                if (sched->node_copies[id][cur_backend_id] == NULL) {
-                    ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                    sched->node_copies[id][cur_backend_id] = tensor_copy;
-                    node_allocr(tensor_copy) = cur_allocr;
-                    SET_CAUSE(tensor_copy, "4.cpy");
-                }
-                node->src[j] = sched->node_copies[id][cur_backend_id];
+#endif
             }
         }
     }
@@ -1333,7 +1339,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         uint64_t compute_start_us = ggml_time_us();
         if (!sched->callback_eval) {
             ggml_backend_graph_compute(split_backend, &split->graph);
-
+            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
         } else {
             // similar to ggml_backend_compare_graph_backend
             for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
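
Note on the first two hunks: the scheduler now registers a tensor as a split input inside the same branch that creates its backend copy, so the node_copies entry doubles as the "already registered" check and the old linear scan over splits[cur_split].inputs can be kept disabled under #if 0. Below is a minimal, self-contained sketch of that deduplication pattern; the Tensor and Split types, the use_input helper, and the fixed array sizes are hypothetical stand-ins for illustration only, not code from ggml-backend.c.

    #include <stddef.h>
    #include <stdio.h>
    #include <assert.h>

    #define MAX_INPUTS   16
    #define MAX_TENSORS  64

    typedef struct { int id; } Tensor;

    typedef struct {
        const Tensor * inputs[MAX_INPUTS];
        int            n_inputs;
    } Split;

    // copies[id] caches the backend-local copy of tensor `id`; NULL means the
    // tensor has not been copied (and therefore not registered) yet.
    static const Tensor * use_input(Split * split, const Tensor * copies[], const Tensor * src) {
        if (copies[src->id] == NULL) {
            copies[src->id] = src;          // stand-in for creating the backend copy
            int n = split->n_inputs++;      // register the input exactly once
            assert(n < MAX_INPUTS);
            split->inputs[n] = src;
        }
        return copies[src->id];             // the node's src pointer is redirected to the copy
    }

    int main(void) {
        Split split = {0};
        const Tensor * copies[MAX_TENSORS] = {0};
        Tensor a = {1};

        use_input(&split, copies, &a);
        use_input(&split, copies, &a);      // second use: no duplicate registration

        printf("n_inputs = %d\n", split.n_inputs);  // prints 1
        return 0;
    }

Using the cached copy as the membership test mirrors why the removed "found" scan became redundant: a tensor gets a copy in the split's backend exactly once, and registration happens in the same branch.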