ggerganov committed
Commit 44cd2d4 · unverified · 1 Parent(s): 8ebb36c

src : relocate new backend sources

Files changed (6)
  1. ggml-kompute.cpp +1990 -0
  2. ggml-kompute.h +46 -0
  3. ggml-sycl.cpp +0 -0
  4. ggml-sycl.h +29 -0
  5. ggml-vulkan.cpp +0 -0
  6. ggml-vulkan.h +39 -0
ggml-kompute.cpp ADDED
@@ -0,0 +1,1990 @@
1
+ #include "ggml.h"
2
+ #include "ggml-backend.h"
3
+ #include "ggml-backend-impl.h"
4
+ #include "ggml-kompute.h"
5
+
6
+ // These are generated at build time by cmake custom command
7
+ #include "shaderop_scale.h"
8
+ #include "shaderop_scale_8.h"
9
+ #include "shaderop_add.h"
10
+ #include "shaderop_addrow.h"
11
+ #include "shaderop_mul.h"
12
+ #include "shaderop_silu.h"
13
+ #include "shaderop_relu.h"
14
+ #include "shaderop_gelu.h"
15
+ #include "shaderop_softmax.h"
16
+ #include "shaderop_norm.h"
17
+ #include "shaderop_rmsnorm.h"
18
+ #include "shaderop_diagmask.h"
19
+ #include "shaderop_mul_mat_f16.h"
20
+ #include "shaderop_mul_mat_q8_0.h"
21
+ #include "shaderop_mul_mat_q4_0.h"
22
+ #include "shaderop_mul_mat_q4_1.h"
23
+ #include "shaderop_mul_mat_q6_k.h"
24
+ #include "shaderop_mul_mat_mat_f32.h"
25
+ #include "shaderop_getrows_f16.h"
26
+ #include "shaderop_getrows_q4_0.h"
27
+ #include "shaderop_getrows_q4_1.h"
28
+ #include "shaderop_getrows_q6_k.h"
29
+ #include "shaderop_rope_f16.h"
30
+ #include "shaderop_rope_f32.h"
31
+ #include "shaderop_cpy_f16_f16.h"
32
+ #include "shaderop_cpy_f16_f32.h"
33
+ #include "shaderop_cpy_f32_f16.h"
34
+ #include "shaderop_cpy_f32_f32.h"
35
+
36
+ #include <algorithm>
37
+ #include <array>
38
+ #include <cassert>
39
+ #include <cstdint>
40
+ #include <cstdio>
41
+ #include <cstring>
42
+ #include <iostream>
43
+ #include <memory>
44
+ #include <stdexcept>
45
+ #include <string>
46
+ #include <unordered_map>
47
+ #include <utility>
48
+ #include <vector>
49
+
50
+ #include <kompute/Kompute.hpp>
51
+ #include <vulkan/vulkan.hpp>
52
+
53
+ #ifdef __linux__
54
+ #include <cstdlib> // for setenv
55
+ #endif
56
+
57
+ #define QK4_0 32
58
+ #define QR4_0 2
59
+ #define QK4_1 32
60
+ #define QK_NL 16
61
+
62
+ typedef ggml_fp16_t half;
63
+
64
+ static std::string ggml_kompute_format_name(int device) {
65
+ return "Kompute" + std::to_string(device);
66
+ }
67
+
68
+ struct ggml_kompute_context {
69
+ int device;
70
+ std::string name;
71
+ std::shared_ptr<vk::DescriptorPool> pool;
72
+
73
+ ggml_kompute_context(int device)
74
+ : device(device), name(ggml_kompute_format_name(device)) {}
75
+ };
76
+
77
+ // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
78
+ // and consolidate the init functions and simplify object lifetime management. As it currently stands,
79
+ // we *have* to have the kompute manager no matter what for device discovery, but the kompute context
80
+ // is only created when a device is set and vulkan is explicitly turned on.
81
+ static ggml_kompute_context *s_kompute_context = nullptr;
82
+
83
+ class kompute_manager {
84
+ kp::Manager *s_mgr = nullptr;
85
+
86
+ public:
87
+ kp::Manager *operator()() {
88
+ if (s_mgr && !s_mgr->hasInstance()) {
89
+ destroy();
90
+ }
91
+ if (!s_mgr) {
92
+ s_mgr = new kp::Manager;
93
+ }
94
+ return s_mgr;
95
+ }
96
+
97
+ void destroy() {
98
+ delete s_mgr;
99
+ s_mgr = nullptr;
100
+ }
101
+ };
102
+
103
+ static kompute_manager komputeManager;
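+ // komputeManager() lazily constructs the kp::Manager on first use and re-creates it
+ // if the previous manager has lost its Vulkan instance; the code below always goes
+ // through this accessor instead of holding a kp::Manager directly.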
104
+
105
+ struct ggml_vk_memory {
106
+ void *data = nullptr;
107
+ size_t size = 0;
108
+ vk::DeviceMemory *primaryMemory = nullptr;
109
+ vk::Buffer *primaryBuffer = nullptr;
110
+ vk::DeviceMemory *stagingMemory = nullptr;
111
+ vk::Buffer *stagingBuffer = nullptr;
112
+ };
113
+
114
+ #ifdef __linux__
115
+ __attribute__((constructor))
116
+ static void enable_sam() {
117
+ setenv("RADV_PERFTEST", "sam", false);
118
+ }
119
+ #endif
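+ // Runs at library load on Linux: opts the RADV (Mesa AMD) driver into "sam"
+ // (Smart Access Memory / resizable BAR) before Vulkan is initialized. The final
+ // argument of 0 means an existing user-set RADV_PERFTEST is not overridden.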
120
+
121
+ static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) {
122
+ vk::PhysicalDeviceFeatures availableFeatures;
123
+ physical_device.getFeatures(&availableFeatures);
124
+
125
+ if (!availableFeatures.shaderInt16)
126
+ return false;
127
+
128
+ vk::PhysicalDeviceVulkan11Features availableFeatures11;
129
+ vk::PhysicalDeviceVulkan12Features availableFeatures12;
130
+
131
+ availableFeatures11.pNext = &availableFeatures12;
132
+ availableFeatures12.pNext = nullptr;
133
+
134
+ vk::PhysicalDeviceFeatures2 features2;
135
+ features2.pNext = &availableFeatures11;
136
+
137
+ physical_device.getFeatures2(&features2);
138
+
139
+ if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
140
+ !availableFeatures11.storageBuffer16BitAccess) {
141
+ return false;
142
+ }
143
+
144
+ if (!availableFeatures12.storageBuffer8BitAccess ||
145
+ !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
146
+ !availableFeatures12.shaderFloat16 ||
147
+ !availableFeatures12.shaderInt8) {
148
+ return false;
149
+ }
150
+
151
+ return true;
152
+ }
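+ // A device is considered usable only if it exposes 16-bit and 8-bit storage/uniform
+ // buffer access plus fp16 and int8 shader arithmetic -- presumably what the generated
+ // compute shaders included above rely on for the f16 and quantized code paths.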
153
+
154
+ static const char * ggml_vk_getVendorName(uint32_t vendorID) {
155
+ switch (vendorID) {
156
+ case 0x10DE:
157
+ return "nvidia";
158
+ case 0x1002:
159
+ return "amd";
160
+ case 0x8086:
161
+ return "intel";
162
+ default:
163
+ return "unknown";
164
+ }
165
+ }
166
+
167
+ static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t memoryRequired) {
168
+ std::vector<ggml_vk_device> results;
169
+ if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
170
+ return results;
171
+
172
+ std::vector<vk::PhysicalDevice> physical_devices;
173
+ try {
174
+ physical_devices = komputeManager()->listDevices();
175
+ } catch (vk::SystemError & err) {
176
+ std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
177
+ return results;
178
+ }
179
+
180
+ uint32_t deviceCount = physical_devices.size();
181
+ if (deviceCount == 0)
182
+ return results;
183
+
184
+ std::unordered_map<std::string, size_t> count_by_name;
185
+
186
+ for (uint32_t i = 0; i < deviceCount; i++) {
187
+ const auto & physical_device = physical_devices[i];
188
+
189
+ VkPhysicalDeviceProperties dev_props = physical_device.getProperties();
190
+ VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties();
191
+ const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion);
192
+ const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion);
193
+ if (major < 1 || minor < 2)
194
+ continue;
195
+
196
+ if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device))
197
+ continue;
198
+
199
+ size_t heapSize = 0;
200
+ for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
201
+ VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
202
+ if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
203
+ heapSize = heap.size;
204
+ break;
205
+ }
206
+ }
207
+
208
+ if (heapSize < memoryRequired)
209
+ continue;
210
+
211
+ auto ext_props = physical_device.enumerateDeviceExtensionProperties();
212
+ bool has_maintenance4 = false;
213
+
214
+ // Check if maintenance4 is supported
215
+ for (const auto & properties : ext_props) {
216
+ if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
217
+ has_maintenance4 = true;
218
+ }
219
+ }
220
+
221
+ vk::PhysicalDeviceSubgroupProperties subgroup_props;
222
+ vk::PhysicalDeviceProperties2 dev_props2;
223
+ vk::PhysicalDeviceMaintenance3Properties dev_props3;
224
+ vk::PhysicalDeviceMaintenance4Properties dev_props4;
225
+ dev_props2.pNext = &dev_props3;
226
+ dev_props3.pNext = &subgroup_props;
227
+ if (has_maintenance4) {
228
+ subgroup_props.pNext = &dev_props4;
229
+ }
230
+ physical_device.getProperties2(&dev_props2);
231
+
232
+ if (subgroup_props.subgroupSize < 32)
233
+ continue;
234
+
235
+ ggml_vk_device d;
236
+ d.index = i;
237
+ d.type = dev_props.deviceType;
238
+ d.heapSize = heapSize;
239
+ d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID));
240
+ d.subgroupSize = subgroup_props.subgroupSize;
241
+ d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment;
242
+
243
+ if (has_maintenance4) {
244
+ d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize);
245
+ } else {
246
+ d.maxAlloc = dev_props3.maxMemoryAllocationSize;
247
+ }
248
+
249
+ std::string name(dev_props.deviceName);
250
+ size_t n_idx = ++count_by_name[name];
251
+ if (n_idx > 1) {
252
+ name += " (" + std::to_string(n_idx) + ")";
253
+ }
254
+ d.name = strdup(name.c_str());
255
+
256
+ results.push_back(d);
257
+ }
258
+
259
+ std::stable_sort(results.begin(), results.end(),
260
+ [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
261
+ if (lhs.type != rhs.type) {
262
+ if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
263
+ if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;
264
+
265
+ if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
266
+ if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
267
+ }
268
+ return lhs.heapSize < rhs.heapSize;
269
+ }
270
+ );
271
+
272
+ return results;
273
+ }
274
+
275
+ // public API returns a C-style array
276
+ ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count) {
277
+ auto devices = ggml_vk_available_devices_internal(memoryRequired);
278
+ *count = devices.size();
279
+ if (devices.empty()) {
280
+ return nullptr;
281
+ }
282
+
283
+ size_t nbytes = sizeof (ggml_vk_device) * (devices.size());
284
+ auto * arr = static_cast<ggml_vk_device *>(malloc(nbytes));
285
+ memcpy(arr, devices.data(), nbytes);
286
+ return arr;
287
+ }
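+ // The returned array is malloc()'d here and the name/vendor strings are strdup()'d in
+ // the internal helper, so ownership of both is left to the caller.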
288
+
289
+ static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
290
+ devices.erase(
291
+ std::remove_if(devices.begin(), devices.end(),
292
+ [&targetVendor](const ggml_vk_device& device) {
293
+ return device.vendor != targetVendor;
294
+ }),
295
+ devices.end()
296
+ );
297
+ }
298
+
299
+ static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std::string& targetName) {
300
+ devices.erase(
301
+ std::remove_if(devices.begin(), devices.end(),
302
+ [&targetName](const ggml_vk_device& device) {
303
+ return device.name != targetName;
304
+ }),
305
+ devices.end()
306
+ );
307
+ }
308
+
309
+ static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
310
+ if (name.empty())
311
+ return false;
312
+
313
+ auto devices = ggml_vk_available_devices_internal(memoryRequired);
314
+ if (name == "amd" || name == "nvidia" || name == "intel") {
315
+ ggml_vk_filterByVendor(devices, name);
316
+ } else if (name != "gpu") {
317
+ ggml_vk_filterByName(devices, name);
318
+ }
319
+
320
+ if (devices.empty())
321
+ return false;
322
+
323
+ *device = devices.front();
324
+ return true;
325
+ }
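+ // Device selection rules: "amd"/"nvidia"/"intel" filter by vendor, "gpu" accepts any
+ // usable device, anything else is matched against the exact device name; the first
+ // entry of the sorted list (discrete GPUs first) is returned.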
326
+
327
+ bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
328
+ return ggml_vk_get_device(device, memoryRequired, std::string(name));
329
+ }
330
+
331
+ bool ggml_vk_has_vulkan() {
332
+ return komputeManager()->hasVulkan();
333
+ }
334
+
335
+ bool ggml_vk_has_device() {
336
+ return komputeManager()->hasDevice();
337
+ }
338
+
339
+ ggml_vk_device ggml_vk_current_device() {
340
+ if (!komputeManager()->hasDevice())
341
+ return ggml_vk_device();
342
+
343
+ auto devices = ggml_vk_available_devices_internal(0);
344
+ ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
345
+ GGML_ASSERT(!devices.empty());
346
+ return devices.front();
347
+ }
348
+
349
+ static
350
+ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
351
+ std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
352
+ vk::DescriptorPoolSize(
353
+ vk::DescriptorType::eStorageBuffer,
354
+ 3 * size // Descriptor count is number of possible tensors to pass into an algorithm
355
+ )
356
+ };
357
+
358
+ vk::DescriptorPoolCreateInfo descriptorPoolInfo(
359
+ vk::DescriptorPoolCreateFlags(),
360
+ size, // Max sets
361
+ static_cast<uint32_t>(descriptorPoolSizes.size()),
362
+ descriptorPoolSizes.data());
363
+
364
+ ctx->pool = std::make_shared<vk::DescriptorPool>();
365
+ vk::Result r = komputeManager()->device()->createDescriptorPool(
366
+ &descriptorPoolInfo, nullptr, ctx->pool.get());
367
+ if (r != vk::Result::eSuccess)
368
+ std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
369
+ }
370
+
371
+ static
372
+ void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
373
+ if (ctx->pool) {
374
+ komputeManager()->device()->destroy(
375
+ *ctx->pool,
376
+ (vk::Optional<const vk::AllocationCallbacks>)nullptr);
377
+ ctx->pool = nullptr;
378
+ }
379
+ }
380
+
381
+ static
382
+ vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
383
+ vk::BufferCreateInfo bufferCreateInfo;
384
+ bufferCreateInfo.size = size;
385
+ bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
386
+ vk::BufferUsageFlagBits::eTransferSrc |
387
+ vk::BufferUsageFlagBits::eTransferDst;
388
+ bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
389
+
390
+ vk::Buffer *vkBuffer = new vk::Buffer;
391
+ vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
392
+ if (r != vk::Result::eSuccess)
393
+ std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl;
394
+ return vkBuffer;
395
+ }
396
+
397
+ static
398
+ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
399
+
400
+ uint32_t memoryTypeIndex = -1;
401
+ bool memoryTypeIndexFound = false;
402
+ vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
403
+ for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
404
+ const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i];
405
+ const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex];
406
+ if (memoryHeap.size < size) {
407
+ continue;
408
+ }
409
+
410
+ if (requirements.memoryTypeBits & (1 << i)) {
411
+ if (((memoryProperties.memoryTypes[i]).propertyFlags &
412
+ flags) == flags) {
413
+ memoryTypeIndex = i;
414
+ memoryTypeIndexFound = true;
415
+ if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) {
416
+ *isHostVisible = true;
417
+ }
418
+ break;
419
+ }
420
+ }
421
+ }
422
+ if (!memoryTypeIndexFound) {
423
+ throw std::runtime_error(
424
+ "Memory type index for buffer creation not found");
425
+ }
426
+
427
+ vk::MemoryAllocateInfo allocInfo;
428
+ allocInfo.allocationSize = size;
429
+ allocInfo.memoryTypeIndex = memoryTypeIndex;
430
+ vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory;
431
+ vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
432
+ if (r != vk::Result::eSuccess) {
433
+ std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
434
+ throw std::runtime_error("Error allocating vulkan memory.");
435
+ }
436
+ return vkDeviceMemory;
437
+ }
438
+
439
+ static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) {
440
+ size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer);
441
+
442
+ // If offset is already aligned, return it directly
443
+ if (offset % minStorageBufferOffsetAlignment == 0) {
444
+ return offset;
445
+ }
446
+
447
+ // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset
448
+ return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
449
+ }
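+ // Rounds the offset down to the buffer's minStorageBufferOffsetAlignment; the
+ // sub-alignment remainder is handled in ggml_vk_get_tensor() below, which reports it
+ // through its alignedOffset out-parameter so shaders can apply it as an element offset.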
450
+
451
+ static ggml_vk_memory ggml_vk_allocate(size_t size) {
452
+ ggml_vk_memory memory;
453
+ bool isHostVisible = false;
454
+ {
455
+ memory.primaryBuffer = ggml_vk_allocate_buffer(size);
456
+ vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
457
+ vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
458
+ memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
459
+ komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
460
+ if (isHostVisible) {
461
+ vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
462
+ if (r != vk::Result::eSuccess)
463
+ std::cerr << "Error mapping memory" << vk::to_string(r);
464
+ }
465
+ }
466
+
467
+ if (!isHostVisible) {
468
+ memory.stagingBuffer = ggml_vk_allocate_buffer(size);
469
+ vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer);
470
+ vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
471
+ vk::MemoryPropertyFlagBits::eHostCoherent |
472
+ vk::MemoryPropertyFlagBits::eHostCached;
473
+ memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
474
+ komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
475
+ vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
476
+ if (r != vk::Result::eSuccess)
477
+ std::cerr << "Error mapping memory" << vk::to_string(r);
478
+ }
479
+
480
+ memory.size = size;
481
+ return memory;
482
+ }
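+ // Two-tier allocation: the primary buffer is placed in device-local memory; if that
+ // memory is not host-visible, a host-visible/coherent/cached staging buffer is also
+ // allocated and mapped, and memory.data points at whichever mapping exists.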
483
+
484
+ static void ggml_vk_free_memory(ggml_vk_memory &memory)
485
+ {
486
+ komputeManager()->device()->destroy(
487
+ *memory.primaryBuffer,
488
+ (vk::Optional<const vk::AllocationCallbacks>)nullptr);
489
+ if (memory.stagingBuffer) {
490
+ komputeManager()->device()->destroy(
491
+ *memory.stagingBuffer,
492
+ (vk::Optional<const vk::AllocationCallbacks>)nullptr);
493
+ }
494
+ komputeManager()->device()->freeMemory(
495
+ *memory.primaryMemory,
496
+ (vk::Optional<const vk::AllocationCallbacks>)nullptr);
497
+ if (memory.stagingMemory) {
498
+ komputeManager()->device()->freeMemory(
499
+ *memory.stagingMemory,
500
+ (vk::Optional<const vk::AllocationCallbacks>)nullptr);
501
+ }
502
+ }
503
+
504
+ static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
505
+
506
+ static
507
+ ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
508
+ ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
509
+
510
+ // compatibility with ggml-backend
511
+ GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
512
+
513
+ ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
514
+
515
+ const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
516
+
517
+ GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
518
+
519
+ offset = uint64_t(ioffs);
520
+ return buf_ctx;
521
+ }
522
+
523
+ static
524
+ const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
525
+ uint64_t originalOffset = 0;
526
+ auto * res = ggml_vk_find_tensor(t, originalOffset);
527
+ if (!res) {
528
+ static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
529
+ return nullTensor;
530
+ }
531
+
532
+ // Create a tensor whose memory will be composed of our buffers at the correct offset
533
+ const size_t nelements = ggml_nelements(t);
534
+ size_t nbytes = ggml_nbytes(t);
535
+
536
+ size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
537
+ if (alignedOffset) {
538
+ *alignedOffset = originalOffset - vulkanOffset;
539
+ nbytes += *alignedOffset;
540
+ }
541
+
542
+ return komputeManager()->tensor(
543
+ t->data,
544
+ nelements,
545
+ nbytes, kp::Tensor::TensorDataTypes::eFloat,
546
+ res->primaryMemory, res->primaryBuffer,
547
+ res->stagingMemory, res->stagingBuffer,
548
+ vulkanOffset);
549
+ }
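+ // The kp::Tensor here appears to be backed by the buffer's existing primary/staging
+ // memory at the aligned offset rather than a fresh allocation; when the caller asks for
+ // the alignment remainder, nbytes is grown by it so the bound range still covers the
+ // whole tensor.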
550
+
551
+ static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
552
+ if (size % sizeof(uint32_t) != 0) {
553
+ throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
554
+ }
555
+
556
+ const uint32_t* data_ptr = reinterpret_cast<const uint32_t*>(rawData);
557
+ size_t count = size / sizeof(uint32_t);
558
+ return std::vector<uint32_t>(data_ptr, data_ptr + count);
559
+ }
560
+
561
+ inline static
562
+ uint32_t safe_divide(uint32_t a, uint32_t b) {
563
+ if (b <= 1) {
564
+ return a;
565
+ }
566
+ if ((a % b) != 0) {
567
+ fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
568
+ GGML_ASSERT(!"safe_divide result would've had remainder");
569
+ }
570
+ return a / b;
571
+ }
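+ // Converts byte offsets into element offsets for the push constants below; for element
+ // sizes greater than 1 the offset must divide exactly or the assert fires.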
572
+
573
+ static void ggml_vk_add(
574
+ kp::Sequence& seq,
575
+ const std::shared_ptr<kp::Tensor>& inA,
576
+ const std::shared_ptr<kp::Tensor>& inB,
577
+ const std::shared_ptr<kp::Tensor>& out,
578
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
579
+ int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
580
+ int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
581
+ int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
582
+ int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
583
+ int32_t ne0,
584
+ int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
585
+ ) {
586
+ const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
587
+ kp::shader_data::op_add_comp_spv_len);
588
+
589
+ struct PushConstants {
590
+ uint32_t inAOff, inBOff, outOff;
591
+ int32_t ne00;
592
+ int32_t nb00, nb01, nb02, nb03;
593
+ int32_t ne10, ne11, ne12, ne13;
594
+ int32_t nb10, nb11, nb12, nb13;
595
+ int32_t ne0;
596
+ int32_t nb0, nb1, nb2, nb3;
597
+ } const pushConsts {
598
+ safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
599
+ ne00,
600
+ nb00, nb01, nb02, nb03,
601
+ ne10, ne11, ne12, ne13,
602
+ nb10, nb11, nb12, nb13,
603
+ ne0,
604
+ nb0, nb1, nb2, nb3
605
+ };
606
+
607
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
608
+ if (!komputeManager()->hasAlgorithm(__func__)) {
609
+ s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
610
+ } else {
611
+ s_algo = komputeManager()->getAlgorithm(__func__);
612
+ s_algo->setTensors({inA, inB, out});
613
+ s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
614
+ s_algo->setPushConstants<PushConstants>({pushConsts});
615
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
616
+ }
617
+ seq.record<kp::OpAlgoDispatch>(s_algo);
618
+ }
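+ // This is the dispatch pattern used by all ops in this file: algorithms are cached in
+ // the manager keyed by name (here __func__), so the SPIR-V pipeline is built once; on
+ // reuse only the tensors, workgroup size and push constants are refreshed before the
+ // dispatch is recorded into the sequence.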
619
+
620
+ static void ggml_vk_addrow(kp::Sequence& seq,
621
+ const std::shared_ptr<kp::Tensor>& inA,
622
+ const std::shared_ptr<kp::Tensor>& inB,
623
+ const std::shared_ptr<kp::Tensor>& out,
624
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
625
+ uint32_t size, uint32_t row = 0) {
626
+
627
+ const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv,
628
+ kp::shader_data::op_addrow_comp_spv_len);
629
+
630
+ struct PushConstants {
631
+ uint32_t inAOff, inBOff, outOff;
632
+ uint32_t row;
633
+ } const pushConsts {
634
+ safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
635
+ row
636
+ };
637
+
638
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
639
+ if (!komputeManager()->hasAlgorithm(__func__))
640
+ s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
641
+ else {
642
+ s_algo = komputeManager()->getAlgorithm(__func__);
643
+ s_algo->setTensors({inA, inB, out});
644
+ s_algo->setWorkgroup({size});
645
+ s_algo->setPushConstants<PushConstants>({pushConsts});
646
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
647
+ }
648
+ seq.record<kp::OpAlgoDispatch>(s_algo);
649
+ }
650
+
651
+ static void ggml_vk_mul(
652
+ kp::Sequence& seq,
653
+ const std::shared_ptr<kp::Tensor>& inA,
654
+ const std::shared_ptr<kp::Tensor>& inB,
655
+ const std::shared_ptr<kp::Tensor>& out,
656
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
657
+ int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
658
+ int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
659
+ int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
660
+ int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
661
+ int32_t ne0,
662
+ int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
663
+ ) {
664
+ const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv,
665
+ kp::shader_data::op_mul_comp_spv_len);
666
+
667
+ struct PushConstants {
668
+ uint32_t inAOff, inBOff, outOff;
669
+ int32_t ne00;
670
+ int32_t nb00, nb01, nb02, nb03;
671
+ int32_t ne10, ne11, ne12, ne13;
672
+ int32_t nb10, nb11, nb12, nb13;
673
+ int32_t ne0;
674
+ int32_t nb0, nb1, nb2, nb3;
675
+ } const pushConsts {
676
+ safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
677
+ ne00,
678
+ nb00, nb01, nb02, nb03,
679
+ ne10, ne11, ne12, ne13,
680
+ nb10, nb11, nb12, nb13,
681
+ ne0,
682
+ nb0, nb1, nb2, nb3
683
+ };
684
+
685
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
686
+ if (!komputeManager()->hasAlgorithm(__func__)) {
687
+ s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
688
+ } else {
689
+ s_algo = komputeManager()->getAlgorithm(__func__);
690
+ s_algo->setTensors({inA, inB, out});
691
+ s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
692
+ s_algo->setPushConstants<PushConstants>({pushConsts});
693
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
694
+ }
695
+ seq.record<kp::OpAlgoDispatch>(s_algo);
696
+ }
697
+
698
+ static void ggml_vk_scale(kp::Sequence& seq,
699
+ const std::shared_ptr<kp::Tensor>& in,
700
+ const std::shared_ptr<kp::Tensor>& out,
701
+ uint32_t inOff, uint32_t outOff,
702
+ uint32_t size, float scale) {
703
+ const static auto spirv_1 = getSpirvShader(
704
+ kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len
705
+ );
706
+ const static auto spirv_8 = getSpirvShader(
707
+ kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len
708
+ );
709
+
710
+ struct PushConstants {
711
+ uint32_t inOff, outOff;
712
+ float scale;
713
+ } const pushConsts {
714
+ safe_divide(inOff, 4), safe_divide(outOff, 4),
715
+ scale
716
+ };
717
+
718
+ const auto * spirv = &spirv_1;
719
+ std::string name(__func__);
720
+ if (size % 8 == 0) {
721
+ size /= 8;
722
+ name += "_8";
723
+ spirv = &spirv_8;
724
+ }
725
+
726
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
727
+ if (!komputeManager()->hasAlgorithm(name)) {
728
+ s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
729
+ } else {
730
+ s_algo = komputeManager()->getAlgorithm(name);
731
+ s_algo->setTensors({in, out});
732
+ s_algo->setWorkgroup({size});
733
+ s_algo->setPushConstants<PushConstants>({pushConsts});
734
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
735
+ }
736
+ seq.record<kp::OpAlgoDispatch>(s_algo);
737
+ }
738
+
739
+ static void ggml_vk_xxlu(
740
+ const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
741
+ const std::shared_ptr<kp::Tensor>& in,
742
+ const std::shared_ptr<kp::Tensor>& out,
743
+ uint32_t inOff, uint32_t outOff,
744
+ uint32_t size
745
+ ) {
746
+ struct PushConstants {
747
+ uint32_t inOff, outOff;
748
+ } const pushConsts {
749
+ safe_divide(inOff, 4), safe_divide(outOff, 4),
750
+ };
751
+
752
+ auto name = std::string(__func__) + "_" + suffix;
753
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
754
+ if (!komputeManager()->hasAlgorithm(name)) {
755
+ s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
756
+ } else {
757
+ s_algo = komputeManager()->getAlgorithm(name);
758
+ s_algo->setTensors({in, out});
759
+ s_algo->setWorkgroup({size});
760
+ s_algo->setPushConstants<PushConstants>({pushConsts});
761
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
762
+ }
763
+ seq.record<kp::OpAlgoDispatch>(s_algo);
764
+ }
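+ // Shared dispatcher for the element-wise unary ops; the silu/relu/gelu wrappers below
+ // only select the SPIR-V blob and a name suffix so each op gets its own cached algorithm.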
765
+
766
+ template <typename... Args>
767
+ static void ggml_vk_silu(Args&&... args) {
768
+ const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
769
+ kp::shader_data::op_silu_comp_spv_len);
770
+
771
+ ggml_vk_xxlu(spirv, "silu", std::forward<Args>(args)...);
772
+ }
773
+
774
+ template <typename... Args>
775
+ static void ggml_vk_relu(Args&&... args) {
776
+ const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
777
+ kp::shader_data::op_relu_comp_spv_len);
778
+
779
+ ggml_vk_xxlu(spirv, "relu", std::forward<Args>(args)...);
780
+ }
781
+
782
+ template <typename... Args>
783
+ static void ggml_vk_gelu(Args&&... args) {
784
+ const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
785
+ kp::shader_data::op_gelu_comp_spv_len);
786
+
787
+ ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
788
+ }
789
+
790
+ static void ggml_vk_soft_max(
791
+ kp::Sequence& seq,
792
+ const std::shared_ptr<kp::Tensor>& inA,
793
+ const std::shared_ptr<kp::Tensor>& inB,
794
+ const std::shared_ptr<kp::Tensor>& out,
795
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
796
+ int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
797
+ float scale
798
+ ) {
799
+ const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
800
+ kp::shader_data::op_softmax_comp_spv_len);
801
+
802
+ struct PushConstants {
803
+ uint32_t inAOff, inBOff, outOff;
804
+ int32_t ne00, ne01, ne02;
805
+ float scale;
806
+ int32_t mask;
807
+ } pushConsts {
808
+ safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
809
+ ne00, ne01, ne02,
810
+ scale,
811
+ bool(inB)
812
+ };
813
+
814
+ auto & inB_ = inB ? inB : inA;
815
+
816
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
817
+ if (!komputeManager()->hasAlgorithm(__func__)) {
818
+ // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
819
+ const uint32_t local_x = 32;
820
+ s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
821
+ } else {
822
+ s_algo = komputeManager()->getAlgorithm(__func__);
823
+ s_algo->setTensors({inA, inB_, out});
824
+ s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
825
+ s_algo->setPushConstants<PushConstants>({pushConsts});
826
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
827
+ }
828
+ seq.record<kp::OpAlgoDispatch>(s_algo);
829
+ }
830
+
831
+ static void ggml_vk_norm_(
832
+ const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
833
+ const std::shared_ptr<kp::Tensor>& in,
834
+ const std::shared_ptr<kp::Tensor>& out,
835
+ uint32_t inOff, uint32_t outOff,
836
+ int32_t ne00, int32_t nb01,
837
+ int32_t nrows, float epsilon
838
+ ) {
839
+ GGML_ASSERT(nb01%sizeof(float) == 0);
840
+ GGML_ASSERT(ne00%sizeof(float) == 0);
841
+
842
+ struct PushConstants {
843
+ uint32_t inOff, outOff;
844
+ uint32_t ne00, nb01;
845
+ float eps;
846
+ } pushConsts {
847
+ safe_divide(inOff, 4), safe_divide(outOff, 4),
848
+ (uint32_t)ne00, (uint32_t)nb01, epsilon
849
+ };
850
+
851
+ auto name = std::string(__func__) + "_" + suffix;
852
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
853
+ if (!komputeManager()->hasAlgorithm(name)) {
854
+ s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
855
+ } else {
856
+ s_algo = komputeManager()->getAlgorithm(name);
857
+ s_algo->setTensors({in, out});
858
+ s_algo->setWorkgroup({(uint32_t)nrows});
859
+ s_algo->setPushConstants<PushConstants>({pushConsts});
860
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
861
+ }
862
+ seq.record<kp::OpAlgoDispatch>(s_algo);
863
+ }
864
+
865
+ template <typename... Args>
866
+ static void ggml_vk_norm(Args&&... args) {
867
+ const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
868
+ kp::shader_data::op_norm_comp_spv_len);
869
+
870
+ ggml_vk_norm_(spirv, "norm", std::forward<Args>(args)...);
871
+ }
872
+
873
+ template <typename... Args>
874
+ static void ggml_vk_rms_norm(Args&&... args) {
875
+ const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
876
+ kp::shader_data::op_rmsnorm_comp_spv_len);
877
+
878
+ ggml_vk_norm_(spirv, "rms", std::forward<Args>(args)...);
879
+ }
880
+
881
+ static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
882
+ const std::shared_ptr<kp::Tensor>& in,
883
+ const std::shared_ptr<kp::Tensor>& out,
884
+ uint32_t inOff, uint32_t outOff,
885
+ uint32_t n_past,
886
+ int32_t ne00, int32_t ne01, int32_t ne02) {
887
+ const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv,
888
+ kp::shader_data::op_diagmask_comp_spv_len);
889
+
890
+ struct PushConstants {
891
+ uint32_t inOff, outOff;
892
+ uint32_t n_past;
893
+ int32_t ne00, ne01;
894
+ } pushConsts {
895
+ safe_divide(inOff, 4), safe_divide(outOff, 4),
896
+ n_past,
897
+ ne00, ne01
898
+ };
899
+
900
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
901
+ if (!komputeManager()->hasAlgorithm(__func__))
902
+ s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
903
+ else {
904
+ s_algo = komputeManager()->getAlgorithm(__func__);
905
+ s_algo->setTensors({in, out});
906
+ s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
907
+ s_algo->setPushConstants<PushConstants>({pushConsts});
908
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
909
+ }
910
+ seq.record<kp::OpAlgoDispatch>(s_algo);
911
+ }
912
+
913
+ static void ggml_vk_mul_mat_f16(
914
+ kp::Sequence& seq,
915
+ const std::shared_ptr<kp::Tensor>& inA,
916
+ const std::shared_ptr<kp::Tensor>& inB,
917
+ const std::shared_ptr<kp::Tensor>& out,
918
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
919
+ int32_t ne00, int32_t ne01, int32_t ne02,
920
+ uint32_t nb00, uint32_t nb01, uint32_t nb02,
921
+ int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
922
+ uint32_t nb10, uint32_t nb11, uint32_t nb12,
923
+ int32_t ne0, int32_t ne1,
924
+ uint32_t r2, uint32_t r3
925
+ ) {
926
+ const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv,
927
+ kp::shader_data::op_mul_mat_f16_comp_spv_len);
928
+
929
+ struct PushConstants {
930
+ uint32_t inAOff, inBOff, outOff;
931
+ int32_t ne00, ne01, ne02;
932
+ uint32_t nb00, nb01, nb02;
933
+ int32_t ne10, ne11, ne12;
934
+ uint32_t nb10, nb11, nb12;
935
+ int32_t ne0, ne1;
936
+ uint32_t r2, r3;
937
+ } pushConsts {
938
+ safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
939
+ ne00, ne01, ne02,
940
+ nb00, nb01, nb02,
941
+ ne10, ne11, ne12,
942
+ nb10, nb11, nb12,
943
+ ne0, ne1,
944
+ r2, r3
945
+ };
946
+
947
+ const unsigned ny = unsigned((ne11 + 4 - 1)/4);
948
+
949
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
950
+ if (!komputeManager()->hasAlgorithm(__func__)) {
951
+ const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
952
+ s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
953
+ } else {
954
+ s_algo = komputeManager()->getAlgorithm(__func__);
955
+ s_algo->setTensors({inA, inB, out});
956
+ s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)});
957
+ s_algo->setPushConstants<PushConstants>({pushConsts});
958
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
959
+ }
960
+ seq.record<kp::OpAlgoDispatch>(s_algo);
961
+ }
962
+
963
+ static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
964
+ const std::shared_ptr<kp::Tensor>& inA,
965
+ const std::shared_ptr<kp::Tensor>& inB,
966
+ const std::shared_ptr<kp::Tensor>& out,
967
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
968
+ int32_t ne00, int32_t ne01, int32_t ne02,
969
+ uint32_t nb01, uint32_t nb02,
970
+ int32_t ne11, int32_t ne12,
971
+ uint32_t nb11, uint32_t nb12,
972
+ uint32_t nb1, uint32_t nb2) {
973
+ const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv,
974
+ kp::shader_data::op_mul_mat_mat_f32_comp_spv_len);
975
+
976
+ struct PushConstants {
977
+ uint32_t inAOff, inBOff, outOff;
978
+ int32_t ne00, ne01, ne02, ne11, ne12;
979
+ uint32_t nb01, nb02;
980
+ uint32_t nb11, nb12;
981
+ uint32_t nb1, nb2;
982
+ } pushConsts {
983
+ safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
984
+ ne00, ne01, ne02, ne11, ne12,
985
+ nb01, nb02, nb11, nb12,
986
+ nb1, nb2
987
+ };
988
+
989
+ const uint32_t local_x = ggml_vk_current_device().subgroupSize;
990
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
991
+ if (!komputeManager()->hasAlgorithm(__func__)) {
992
+ s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
993
+ {inA, inB, out}, spirv,
994
+ {unsigned(ne01),
995
+ unsigned(ne11),
996
+ unsigned(std::max(ne12, ne02))
997
+ },
998
+ {local_x},
999
+ {pushConsts});
1000
+ } else {
1001
+ s_algo = komputeManager()->getAlgorithm(__func__);
1002
+ s_algo->setTensors({inA, inB, out});
1003
+ s_algo->setWorkgroup({unsigned(ne01),
1004
+ unsigned(ne11),
1005
+ unsigned(std::max(ne12, ne02)),
1006
+ });
1007
+ s_algo->setPushConstants<PushConstants>({pushConsts});
1008
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
1009
+ }
1010
+ seq.record<kp::OpAlgoDispatch>(s_algo);
1011
+ }
1012
+
1013
+ static void ggml_vk_mul_mat_impl(
1014
+ const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
1015
+ const std::shared_ptr<kp::Tensor>& inA,
1016
+ const std::shared_ptr<kp::Tensor>& inB,
1017
+ const std::shared_ptr<kp::Tensor>& out,
1018
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1019
+ int32_t ne00, int32_t ne01, int32_t ne02,
1020
+ int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1021
+ int32_t ne0, int32_t ne1,
1022
+ uint32_t r2, uint32_t r3
1023
+ ) {
1024
+ struct PushConstants {
1025
+ uint32_t inAOff, inBOff, outOff;
1026
+ int32_t ne00, ne01, ne02;
1027
+ int32_t ne10, ne12;
1028
+ int32_t ne0, ne1;
1029
+ uint32_t r2, r3;
1030
+ } pushConsts {
1031
+ safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1032
+ ne00, ne01, ne02,
1033
+ ne10, ne12,
1034
+ ne0, ne1,
1035
+ r2, r3
1036
+ };
1037
+
1038
+ auto name = std::string(__func__) + "_" + suffix;
1039
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1040
+ if (!komputeManager()->hasAlgorithm(name)) {
1041
+ const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
1042
+ s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
1043
+ } else {
1044
+ s_algo = komputeManager()->getAlgorithm(name);
1045
+ s_algo->setTensors({inA, inB, out});
1046
+ s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)});
1047
+ s_algo->setPushConstants<PushConstants>({pushConsts});
1048
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
1049
+ }
1050
+ seq.record<kp::OpAlgoDispatch>(s_algo);
1051
+ }
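+ // Shared dispatcher for the quantized mat-vec kernels (q4_0/q4_1/q8_0 wrappers below).
+ // Each workgroup appears to cover 8 rows of inA ((ne01 + 7)/8 groups in x) with a local
+ // size of twice the subgroup size; block_size is 1 for these types because the quantized
+ // blocks are addressed unaligned, as the wrappers' inline comments note.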
1052
+
1053
+ template <typename... Args>
1054
+ static void ggml_vk_mul_mat_q4_0(Args&&... args) {
1055
+ const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
1056
+ kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
1057
+
1058
+ ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1059
+ }
1060
+
1061
+ template <typename... Args>
1062
+ static void ggml_vk_mul_mat_q4_1(Args&&... args) {
1063
+ const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
1064
+ kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
1065
+
1066
+ ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1067
+ }
1068
+
1069
+ template <typename... Args>
1070
+ static void ggml_vk_mul_mat_q8_0(Args&&... args) {
1071
+ const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
1072
+ kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
1073
+
1074
+ ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1075
+ }
1076
+
1077
+ static void ggml_vk_mul_mat_q6_k(
1078
+ kp::Sequence& seq,
1079
+ const std::shared_ptr<kp::Tensor>& inA,
1080
+ const std::shared_ptr<kp::Tensor>& inB,
1081
+ const std::shared_ptr<kp::Tensor>& out,
1082
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1083
+ int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
1084
+ int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
1085
+ ) {
1086
+ const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
1087
+ kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
1088
+
1089
+ struct PushConstants {
1090
+ uint32_t inAOff, inBOff, outOff;
1091
+ int32_t ne00, ne10, ne0, ne1, ne01, gqa;
1092
+ } pushConsts {
1093
+ inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1094
+ ne00, ne10, ne0, ne1, ne01, ne12/ne02
1095
+ };
1096
+
1097
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1098
+ if (!komputeManager()->hasAlgorithm(__func__)) {
1099
+ const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
1100
+ s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
1101
+ } else {
1102
+ s_algo = komputeManager()->getAlgorithm(__func__);
1103
+ s_algo->setTensors({inA, inB, out});
1104
+ s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
1105
+ s_algo->setPushConstants<PushConstants>({pushConsts});
1106
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
1107
+ }
1108
+ seq.record<kp::OpAlgoDispatch>(s_algo);
1109
+ }
1110
+
1111
+ static void ggml_vk_get_rows(
1112
+ const std::vector<uint32_t>& spirv,
1113
+ const char * suffix,
1114
+ unsigned element_size, unsigned qk,
1115
+ kp::Sequence& seq,
1116
+ const std::shared_ptr<kp::Tensor>& inA,
1117
+ const std::shared_ptr<kp::Tensor>& inB,
1118
+ const std::shared_ptr<kp::Tensor>& out,
1119
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1120
+ int32_t ne00, int32_t nb01, int32_t nb1,
1121
+ uint32_t size
1122
+ ) {
1123
+ GGML_ASSERT(nb01%element_size == 0);
1124
+ GGML_ASSERT(nb1%sizeof(float) == 0);
1125
+ if (qk) GGML_ASSERT(ne00%qk == 0);
1126
+
1127
+ struct PushConstants {
1128
+ uint32_t inAOff, inBOff, outOff;
1129
+ int32_t ne00, nb01, nb1;
1130
+ } pushConsts {
1131
+ safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1132
+ ne00, nb01, nb1
1133
+ };
1134
+
1135
+ auto name = std::string(__func__) + "_" + suffix;
1136
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1137
+ if (!komputeManager()->hasAlgorithm(name)) {
1138
+ s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
1139
+ } else {
1140
+ s_algo = komputeManager()->getAlgorithm(name);
1141
+ s_algo->setTensors({inA, inB, out});
1142
+ s_algo->setWorkgroup({size});
1143
+ s_algo->setPushConstants<PushConstants>({pushConsts});
1144
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
1145
+ }
1146
+ seq.record<kp::OpAlgoDispatch>(s_algo);
1147
+ }
1148
+
1149
+ template <typename... Args>
1150
+ static void ggml_vk_get_rows_f16(Args&&... args) {
1151
+ const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
1152
+ kp::shader_data::op_getrows_f16_comp_spv_len);
1153
+
1154
+ ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
1155
+ }
1156
+
1157
+ template <typename... Args>
1158
+ static void ggml_vk_get_rows_q4_0(Args&&... args) {
1159
+ const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
1160
+ kp::shader_data::op_getrows_q4_0_comp_spv_len);
1161
+
1162
+ ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
1163
+ }
1164
+
1165
+ template <typename... Args>
1166
+ static void ggml_vk_get_rows_q4_1(Args&&... args) {
1167
+ const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
1168
+ kp::shader_data::op_getrows_q4_1_comp_spv_len);
1169
+
1170
+ ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
1171
+ }
1172
+
1173
+ template <typename... Args>
1174
+ static void ggml_vk_get_rows_q6_k(Args&&... args) {
1175
+ const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
1176
+ kp::shader_data::op_getrows_q6_k_comp_spv_len);
1177
+ ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
1178
+ }
1179
+
1180
+ static void ggml_vk_rope(
1181
+ kp::Sequence& seq,
1182
+ const std::shared_ptr<kp::Tensor>& inA,
1183
+ const std::shared_ptr<kp::Tensor>& inB,
1184
+ const std::shared_ptr<kp::Tensor>& out,
1185
+ uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1186
+ ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx,
1187
+ float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
1188
+ int32_t ne01, int32_t ne02, int32_t ne03,
1189
+ uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1190
+ int32_t ne0,
1191
+ uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
1192
+ ) {
1193
+ GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
1194
+
1195
+ static const auto spirv_f16 = getSpirvShader(
1196
+ kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
1197
+ );
1198
+ static const auto spirv_f32 = getSpirvShader(
1199
+ kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
1200
+ );
1201
+
1202
+ int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
1203
+
1204
+ GGML_ASSERT(nb03 % type_size == 0);
1205
+ GGML_ASSERT(nb02 % type_size == 0);
1206
+ GGML_ASSERT(nb01 % type_size == 0);
1207
+ GGML_ASSERT(nb00 % type_size == 0);
1208
+ GGML_ASSERT(nb3 % type_size == 0);
1209
+ GGML_ASSERT(nb2 % type_size == 0);
1210
+ GGML_ASSERT(nb1 % type_size == 0);
1211
+ GGML_ASSERT(nb0 % type_size == 0);
1212
+
1213
+ struct PushConstants {
1214
+ uint32_t inAOff, inBOff, outOff;
1215
+ int32_t n_dims, mode, n_orig_ctx;
1216
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1217
+ uint32_t nb00, nb01, nb02, nb03;
1218
+ int32_t ne0;
1219
+ uint32_t nb0, nb1, nb2, nb3;
1220
+ } pushConsts {
1221
+ safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
1222
+ n_dims, mode, n_orig_ctx,
1223
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
1224
+ nb00, nb01, nb02, nb03,
1225
+ ne0,
1226
+ nb0, nb1, nb2, nb3
1227
+ };
1228
+
1229
+ auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
1230
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1231
+ if (!komputeManager()->hasAlgorithm(name)) {
1232
+ s_algo = komputeManager()->algorithm<float, PushConstants>(
1233
+ name, s_kompute_context->pool.get(), {inA, inB, out},
1234
+ src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
1235
+ {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
1236
+ );
1237
+ } else {
1238
+ s_algo = komputeManager()->getAlgorithm(name);
1239
+ s_algo->setTensors({inA, inB, out});
1240
+ s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1241
+ s_algo->setPushConstants<PushConstants>({pushConsts});
1242
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
1243
+ }
1244
+ seq.record<kp::OpAlgoDispatch>(s_algo);
1245
+ }
1246
+
1247
+ static void ggml_vk_cpy(
1248
+ const std::vector<uint32_t>& spirv,
1249
+ uint32_t in_element_size, uint32_t out_element_size,
1250
+ kp::Sequence& seq,
1251
+ const std::shared_ptr<kp::Tensor>& in,
1252
+ const std::shared_ptr<kp::Tensor>& out,
1253
+ uint32_t inOff, uint32_t outOff,
1254
+ int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
1255
+ uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1256
+ int32_t ne0, int32_t ne1, int32_t ne2,
1257
+ uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
1258
+ ) {
1259
+ struct PushConstants {
1260
+ uint32_t inOff, outOff;
1261
+ int32_t ne00, ne01, ne02;
1262
+ uint32_t nb00, nb01, nb02, nb03;
1263
+ int32_t ne0, ne1, ne2;
1264
+ uint32_t nb0, nb1, nb2, nb3;
1265
+ } pushConsts {
1266
+ safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size),
1267
+ ne00, ne01, ne02,
1268
+ nb00, nb01, nb02, nb03,
1269
+ ne0, ne1, ne2,
1270
+ nb0, nb1, nb2, nb3
1271
+ };
1272
+
1273
+ std::string name = std::string(__func__)
1274
+ + "_i_" + std::to_string(in_element_size)
1275
+ + "_o_" + std::to_string(out_element_size);
1276
+ std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1277
+ if (!komputeManager()->hasAlgorithm(name))
1278
+ s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
1279
+ else {
1280
+ s_algo = komputeManager()->getAlgorithm(name);
1281
+ s_algo->setTensors({in, out});
1282
+ s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1283
+ s_algo->setPushConstants<PushConstants>({pushConsts});
1284
+ s_algo->updateDescriptors(s_kompute_context->pool.get());
1285
+ }
1286
+ seq.record<kp::OpAlgoDispatch>(s_algo);
1287
+ }
1288
+
1289
+ template <typename... Args>
1290
+ static void ggml_vk_cpy_f32_f16(Args&&... args) {
1291
+ const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
1292
+ kp::shader_data::op_cpy_f32_f16_comp_spv_len);
1293
+ ggml_vk_cpy(spirv, 4, 2, std::forward<Args>(args)...);
1294
+ }
1295
+
1296
+ template <typename... Args>
1297
+ static void ggml_vk_cpy_f32_f32(Args&&... args) {
1298
+ const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
1299
+ kp::shader_data::op_cpy_f32_f32_comp_spv_len);
1300
+ ggml_vk_cpy(spirv, 4, 4, std::forward<Args>(args)...);
1301
+ }
1302
+
1303
+ template <typename... Args>
1304
+ static void ggml_vk_cpy_f16_f16(Args&&... args) {
1305
+ const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
1306
+ kp::shader_data::op_cpy_f16_f16_comp_spv_len);
1307
+ ggml_vk_cpy(spirv, 2, 2, std::forward<Args>(args)...);
1308
+ }
1309
+
1310
+ template <typename... Args>
1311
+ static void ggml_vk_cpy_f16_f32(Args&&... args) {
1312
+ const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
1313
+ kp::shader_data::op_cpy_f16_f32_comp_spv_len);
1314
+ ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
1315
+ }
1316
+
1317
+ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
1318
+ switch (op->type) {
1319
+ case GGML_TYPE_F16:
1320
+ case GGML_TYPE_F32:
1321
+ case GGML_TYPE_Q4_0:
1322
+ case GGML_TYPE_Q4_1:
1323
+ break;
1324
+ default:
1325
+ return false;
1326
+ }
1327
+
1328
+ switch (op->op) {
1329
+ case GGML_OP_UNARY:
1330
+ switch (ggml_get_unary_op(op)) {
1331
+ case GGML_UNARY_OP_RELU:
1332
+ case GGML_UNARY_OP_GELU:
1333
+ case GGML_UNARY_OP_SILU:
1334
+ return true;
1335
+ default:
1336
+ ;
1337
+ }
1338
+ break;
1339
+ case GGML_OP_NONE:
1340
+ case GGML_OP_RESHAPE:
1341
+ case GGML_OP_VIEW:
1342
+ case GGML_OP_TRANSPOSE:
1343
+ case GGML_OP_PERMUTE:
1344
+ case GGML_OP_ADD:
1345
+ case GGML_OP_MUL:
1346
+ case GGML_OP_SCALE:
1347
+ case GGML_OP_SOFT_MAX:
1348
+ case GGML_OP_RMS_NORM:
1349
+ case GGML_OP_NORM:
1350
+ case GGML_OP_ROPE:
1351
+ return true;
1352
+ case GGML_OP_DUP:
1353
+ case GGML_OP_CPY:
1354
+ case GGML_OP_CONT:
1355
+ switch (op->src[0]->type) {
1356
+ case GGML_TYPE_F32:
1357
+ case GGML_TYPE_F16:
1358
+ break;
1359
+ default:
1360
+ return false;
1361
+ }
1362
+ switch (op->type) {
1363
+ case GGML_TYPE_F32:
1364
+ case GGML_TYPE_F16:
1365
+ break;
1366
+ default:
1367
+ return false;
1368
+ }
1369
+ return true;
1370
+ case GGML_OP_DIAG_MASK_INF:
1371
+ return op->ne[3] == 1;
1372
+ case GGML_OP_GET_ROWS:
1373
+ switch (op->src[0]->type) {
1374
+ case GGML_TYPE_F16:
1375
+ case GGML_TYPE_Q4_0:
1376
+ case GGML_TYPE_Q4_1:
1377
+ case GGML_TYPE_Q6_K:
1378
+ return op->ne[2] == 1 && op->ne[3] == 1;
1379
+ default:
1380
+ ;
1381
+ }
1382
+ return false;
1383
+ case GGML_OP_MUL_MAT:
1384
+ if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1]))
1385
+ return false;
1386
+
1387
+ switch (op->src[0]->type) {
1388
+ case GGML_TYPE_F32:
1389
+ case GGML_TYPE_Q6_K:
1390
+ return op->ne[3] == 1;
1391
+ case GGML_TYPE_F16:
1392
+ case GGML_TYPE_Q8_0:
1393
+ case GGML_TYPE_Q4_0:
1394
+ case GGML_TYPE_Q4_1:
1395
+ return true;
1396
+ default:
1397
+ ;
1398
+ }
1399
+ default:
1400
+ ;
1401
+ }
1402
+ return false;
1403
+ }
1404
+
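ggml_vk_supports_op above is the single gate that decides which graph nodes this backend accepts. A minimal, hypothetical helper in the same translation unit could use it like this (illustration only, not part of the commit; try_offload_node is an invented name):

// Hypothetical: skip graph nodes the Kompute backend cannot run.
// 'node' is assumed to come from a ggml graph built elsewhere.
static bool try_offload_node(const struct ggml_tensor * node) {
    if (!ggml_vk_supports_op(node)) {
        return false; // leave this node to the CPU backend
    }
    // ... record the corresponding Kompute dispatch ...
    return true;
}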
1405
+ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
1406
+ const int n_seq = 8;
1407
+
1408
+ // FIXME: figure out whether the descriptor pool can be made smaller; right now it is sized
1409
+ // to the number of nodes in the graph, which is likely more than necessary
1410
+ ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes);
1411
+
1412
+ std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
1413
+
1414
+ for (auto& sequence : sequences) {
1415
+ sequence = komputeManager()->sequence();
1416
+ }
1417
+ for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
1418
+ const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
1419
+
1420
+ auto& seq = *sequences[seq_idx];
1421
+
1422
+ const int node_start = (seq_idx + 0) * n_nodes_per_seq;
1423
+ const int node_end = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes);
1424
+
1425
+ bool any_commands_recorded = false;
1426
+
1427
+ for (int i = node_start; i < node_end; ++i) {
1428
+ struct ggml_tensor * src0 = gf->nodes[i]->src[0];
1429
+ struct ggml_tensor * src1 = gf->nodes[i]->src[1];
1430
+ struct ggml_tensor * dst = gf->nodes[i];
1431
+ GGML_ASSERT(dst->data != nullptr);
1432
+
1433
+ switch (dst->op) {
1434
+ case GGML_OP_NONE:
1435
+ case GGML_OP_RESHAPE:
1436
+ case GGML_OP_VIEW:
1437
+ case GGML_OP_TRANSPOSE:
1438
+ case GGML_OP_PERMUTE:
1439
+ continue; // noop -> next node
1440
+ default:
1441
+ break;
1442
+ }
1443
+
1444
+ any_commands_recorded = true;
1445
+
1446
+ if (!ggml_vk_supports_op(dst)) {
1447
+ fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
1448
+ GGML_ASSERT(!"unsupported op");
1449
+ }
1450
+
1451
+ const int32_t ne00 = src0 ? src0->ne[0] : 0;
1452
+ const int32_t ne01 = src0 ? src0->ne[1] : 0;
1453
+ const int32_t ne02 = src0 ? src0->ne[2] : 0;
1454
+ const int32_t ne03 = src0 ? src0->ne[3] : 0;
1455
+
1456
+ const uint32_t nb00 = src0 ? src0->nb[0] : 0;
1457
+ const uint32_t nb01 = src0 ? src0->nb[1] : 0;
1458
+ const uint32_t nb02 = src0 ? src0->nb[2] : 0;
1459
+ const uint32_t nb03 = src0 ? src0->nb[3] : 0;
1460
+
1461
+ const int32_t ne10 = src1 ? src1->ne[0] : 0;
1462
+ const int32_t ne11 = src1 ? src1->ne[1] : 0;
1463
+ const int32_t ne12 = src1 ? src1->ne[2] : 0;
1464
+ const int32_t ne13 = src1 ? src1->ne[3] : 0;
1465
+
1466
+ const uint32_t nb10 = src1 ? src1->nb[0] : 0;
1467
+ const uint32_t nb11 = src1 ? src1->nb[1] : 0;
1468
+ const uint32_t nb12 = src1 ? src1->nb[2] : 0;
1469
+ const uint32_t nb13 = src1 ? src1->nb[3] : 0;
1470
+
1471
+ const int32_t ne0 = dst ? dst->ne[0] : 0;
1472
+ const int32_t ne1 = dst ? dst->ne[1] : 0;
1473
+ const int32_t ne2 = dst ? dst->ne[2] : 0;
1474
+ // const int32_t ne3 = dst ? dst->ne[3] : 0;
1475
+
1476
+ const uint32_t nb0 = dst ? dst->nb[0] : 0;
1477
+ const uint32_t nb1 = dst ? dst->nb[1] : 0;
1478
+ const uint32_t nb2 = dst ? dst->nb[2] : 0;
1479
+ const uint32_t nb3 = dst ? dst->nb[3] : 0;
1480
+
1481
+ const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
1482
+ const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
1483
+ const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
1484
+
1485
+ const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
1486
+ uint32_t off_src0 = 0;
1487
+ uint32_t off_src1 = 0;
1488
+ uint32_t off_dst = 0;
1489
+ const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
1490
+ const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
1491
+ const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
1492
+
1493
+ switch (dst->op) {
1494
+ case GGML_OP_ADD:
1495
+ {
1496
+ if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
1497
+ // src1 is a row
1498
+ ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
1499
+ } else {
1500
+ ggml_vk_add(
1501
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1502
+ ne00, ne01, ne02, ne03,
1503
+ nb00, nb01, nb02, nb03,
1504
+ ne10, ne11, ne12, ne13,
1505
+ nb10, nb11, nb12, nb13,
1506
+ ne0,
1507
+ nb0, nb1, nb2, nb3
1508
+ );
1509
+ }
1510
+ } break;
1511
+ case GGML_OP_MUL:
1512
+ {
1513
+ ggml_vk_mul(
1514
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1515
+ ne00, ne01, ne02, ne03,
1516
+ nb00, nb01, nb02, nb03,
1517
+ ne10, ne11, ne12, ne13,
1518
+ nb10, nb11, nb12, nb13,
1519
+ ne0,
1520
+ nb0, nb1, nb2, nb3
1521
+ );
1522
+ } break;
1523
+ case GGML_OP_SCALE:
1524
+ {
1525
+ float scale; memcpy(&scale, dst->op_params, sizeof(float));
1526
+
1527
+ ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
1528
+ } break;
1529
+ case GGML_OP_UNARY:
1530
+ {
1531
+ int64_t n = ggml_nelements(dst);
1532
+ GGML_ASSERT(n % 4 == 0);
1533
+ switch (ggml_get_unary_op(gf->nodes[i])) {
1534
+ case GGML_UNARY_OP_SILU:
1535
+ {
1536
+ ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
1537
+ } break;
1538
+ case GGML_UNARY_OP_RELU:
1539
+ {
1540
+ ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
1541
+ } break;
1542
+ case GGML_UNARY_OP_GELU:
1543
+ {
1544
+ GGML_ASSERT(n % 8 == 0);
1545
+ ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8);
1546
+ } break;
1547
+ default:
1548
+ {
1549
+ fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1550
+ GGML_ASSERT(false);
1551
+ }
1552
+ }
1553
+ } break;
1554
+ case GGML_OP_SOFT_MAX:
1555
+ {
1556
+ float scale;
1557
+ memcpy(&scale, dst->op_params, sizeof(float));
1558
+ ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
1559
+ } break;
1560
+ case GGML_OP_DIAG_MASK_INF:
1561
+ {
1562
+ const int n_past = ((int32_t *)(dst->op_params))[0];
1563
+ ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
1564
+ } break;
1565
+ case GGML_OP_NORM:
1566
+ {
1567
+ float eps;
1568
+ memcpy(&eps, dst->op_params, sizeof(float));
1569
+ ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
1570
+ } break;
1571
+ case GGML_OP_RMS_NORM:
1572
+ {
1573
+ GGML_ASSERT(ne00 % 4 == 0);
1574
+
1575
+ float eps;
1576
+ memcpy(&eps, dst->op_params, sizeof(float));
1577
+ ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
1578
+ } break;
1579
+ case GGML_OP_MUL_MAT:
1580
+ {
1581
+ GGML_ASSERT(ne00 == ne10);
1582
+
1583
+ // TODO: assert that dim2 and dim3 are contiguous
1584
+ GGML_ASSERT(ne12 % ne02 == 0);
1585
+ GGML_ASSERT(ne13 % ne03 == 0);
1586
+
1587
+ const uint32_t r2 = ne12/ne02;
1588
+ const uint32_t r3 = ne13/ne03;
1589
+
1590
+ if (src1t != GGML_TYPE_F32) {
1591
+ fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1592
+ goto not_implemented;
1593
+ }
1594
+
1595
+ if (ggml_is_transposed(src0) ||
1596
+ ggml_is_transposed(src1)) {
1597
+ fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1598
+ goto not_implemented;
1599
+ }
1600
+
1601
+ switch (src0t) {
1602
+ case GGML_TYPE_F32:
1603
+ ggml_vk_mul_mat_mat_f32(
1604
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1605
+ ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2
1606
+ );
1607
+ break;
1608
+ case GGML_TYPE_F16:
1609
+ ggml_vk_mul_mat_f16(
1610
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1611
+ ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
1612
+ ne0, ne1, r2, r3
1613
+ );
1614
+ break;
1615
+ case GGML_TYPE_Q8_0:
1616
+ ggml_vk_mul_mat_q8_0(
1617
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1618
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
1619
+ );
1620
+ break;
1621
+ case GGML_TYPE_Q4_0:
1622
+ ggml_vk_mul_mat_q4_0(
1623
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1624
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
1625
+ );
1626
+ break;
1627
+ case GGML_TYPE_Q4_1:
1628
+ ggml_vk_mul_mat_q4_1(
1629
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1630
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
1631
+ );
1632
+ break;
1633
+ case GGML_TYPE_Q6_K:
1634
+ ggml_vk_mul_mat_q6_k(
1635
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1636
+ ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
1637
+ );
1638
+ break;
1639
+ default: {
1640
+ fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1641
+ goto not_implemented;
1642
+ }
1643
+ }
1644
+
1645
+ } break;
1646
+ case GGML_OP_GET_ROWS:
1647
+ {
1648
+ if (src0t == GGML_TYPE_F16) {
1649
+ ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1650
+ } else if (src0t == GGML_TYPE_Q4_0) {
1651
+ ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1652
+ } else if (src0t == GGML_TYPE_Q4_1) {
1653
+ ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1654
+ } else if (src0t == GGML_TYPE_Q6_K) {
1655
+ ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1656
+ } else {
1657
+ fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
1658
+ goto not_implemented;
1659
+ }
1660
+ } break;
1661
+ case GGML_OP_ROPE:
1662
+ {
1663
+ GGML_ASSERT(ne10 == ne02);
1664
+ GGML_ASSERT(src0t == dstt);
1665
+ // const int n_past = ((int32_t *) dst->op_params)[0];
1666
+ const int n_dims = ((int32_t *) dst->op_params)[1];
1667
+ const int mode = ((int32_t *) dst->op_params)[2];
1668
+ // skip op_params[3] (n_ctx): it is only used by GLM RoPE, which is not implemented in this backend
1669
+ const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
1670
+
1671
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1672
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
1673
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
1674
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
1675
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
1676
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1677
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1678
+ ggml_vk_rope(
1679
+ seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx,
1680
+ freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
1681
+ ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
1682
+ );
1683
+ } break;
1684
+ case GGML_OP_DUP:
1685
+ case GGML_OP_CPY:
1686
+ case GGML_OP_CONT:
1687
+ {
1688
+ switch (src0t) {
1689
+ case GGML_TYPE_F32:
1690
+ {
1691
+ switch (dstt) {
1692
+ case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1693
+ case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1694
+ default: goto not_implemented;
1695
+ }
1696
+ } break;
1697
+ case GGML_TYPE_F16:
1698
+ {
1699
+ switch (dstt) {
1700
+ case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1701
+ case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1702
+ default: goto not_implemented;
1703
+ }
1704
+ } break;
1705
+ default: goto not_implemented;
1706
+ }
1707
+ } break;
1708
+ default: goto not_implemented;
1709
+ }
1710
+ continue;
1711
+ not_implemented: {}
1712
+ fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1713
+ //GGML_ASSERT(false);
1714
+ }
1715
+
1716
+ // Evaluate sequence
1717
+ if (any_commands_recorded) {
1718
+ seq.evalAsync();
1719
+ }
1720
+ }
1721
+
1722
+ // Wait for all sequences to finish
1723
+ for (auto& sequence : sequences) {
1724
+ if (sequence->isRunning())
1725
+ sequence->evalAwait();
1726
+ }
1727
+
1728
+ ggml_vk_free_descriptor_pool(ctx);
1729
+ }
1730
+
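ggml_vk_graph_compute above splits the graph across eight Kompute sequences and evaluates them asynchronously. The standalone sketch below (not part of the commit) reproduces only the chunking arithmetic, with made-up sizes, to show how nodes are distributed and why trailing sequences can end up empty:

// Standalone illustration of the node partitioning used in ggml_vk_graph_compute.
// n_nodes is a hypothetical graph size; n_seq matches the constant in the backend above.
#include <algorithm>
#include <cstdio>

int main() {
    const int n_nodes = 20;
    const int n_seq   = 8;
    const int n_nodes_per_seq = (n_nodes + n_seq - 1) / n_seq; // ceiling division
    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
        const int node_start = seq_idx * n_nodes_per_seq;
        const int node_end   = std::min((seq_idx == n_seq - 1) ? n_nodes : (seq_idx + 1) * n_nodes_per_seq, n_nodes);
        if (node_start >= node_end) {
            std::printf("sequence %d records no nodes\n", seq_idx);
        } else {
            std::printf("sequence %d handles nodes [%d, %d)\n", seq_idx, node_start, node_end);
        }
    }
    return 0;
}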
1731
+ template<>
1732
+ kp::Tensor::TensorDataTypes
1733
+ kp::TensorT<half>::dataType()
1734
+ {
1735
+ return TensorDataTypes::eFloat;
1736
+ }
1737
+
1738
+ template<>
1739
+ kp::Tensor::TensorDataTypes
1740
+ kp::TensorT<uint8_t>::dataType()
1741
+ {
1742
+ return TensorDataTypes::eUnsignedInt;
1743
+ }
1744
+
1745
+ ////////////////////////////////////////////////////////////////////////////////
1746
+
1747
+ // backend interface
1748
+
1749
+ struct ggml_backend_kompute_buffer_type_context {
1750
+ int device;
1751
+ int device_ref = 0;
1752
+ uint64_t buffer_alignment;
1753
+ uint64_t max_alloc;
1754
+ std::string name;
1755
+
1756
+ ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
1757
+ : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
1758
+ };
1759
+
1760
+ static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
1761
+ auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1762
+
1763
+ if (!ctx->device_ref) {
1764
+ komputeManager()->initializeDevice(
1765
+ ctx->device, {}, {
1766
+ "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
1767
+ "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
1768
+ }
1769
+ );
1770
+ }
1771
+
1772
+ assert(ggml_vk_has_device());
1773
+ ctx->device_ref++;
1774
+ }
1775
+
1776
+ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
1777
+ auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1778
+
1779
+ assert(ctx->device_ref > 0);
1780
+
1781
+ ctx->device_ref--;
1782
+
1783
+ if (!ctx->device_ref) {
1784
+ komputeManager.destroy();
1785
+ }
1786
+ }
1787
+
1788
+ static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
1789
+ auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
1790
+ return ctx->name.c_str();
1791
+ }
1792
+
1793
+ static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1794
+ auto * memory = (ggml_vk_memory *)buffer->context;
1795
+ if (ggml_vk_has_device()) {
1796
+ ggml_vk_free_memory(*memory);
1797
+ }
1798
+ delete memory;
1799
+ }
1800
+
1801
+ static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
1802
+ return ((ggml_vk_memory *)buffer->context)->data;
1803
+ }
1804
+
1805
+ static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1806
+ GGML_UNUSED(buffer);
1807
+
1808
+ const auto res = ggml_vk_get_tensor(tensor);
1809
+ GGML_ASSERT(res);
1810
+
1811
+ memcpy((char *)tensor->data + offset, data, size);
1812
+
1813
+ komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
1814
+ }
1815
+
1816
+ static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1817
+ GGML_UNUSED(buffer);
1818
+
1819
+ const auto res = ggml_vk_get_tensor(tensor);
1820
+ GGML_ASSERT(res);
1821
+
1822
+ komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
1823
+
1824
+ memcpy(data, (const char *)tensor->data + offset, size);
1825
+ }
1826
+
1827
+ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1828
+ auto * memory = (ggml_vk_memory *)buffer->context;
1829
+ memset(memory->data, value, buffer->size);
1830
+
1831
+ if (memory->stagingBuffer)
1832
+ komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory->primaryBuffer, memory->stagingBuffer, memory->size);
1833
+ }
1834
+
1835
+ static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
1836
+ /* .get_name = */ ggml_backend_kompute_buffer_get_name,
1837
+ /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
1838
+ /* .get_base = */ ggml_backend_kompute_buffer_get_base,
1839
+ /* .init_tensor = */ NULL,
1840
+ /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor,
1841
+ /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor,
1842
+ /* .cpy_tensor = */ NULL,
1843
+ /* .clear = */ ggml_backend_kompute_buffer_clear,
1844
+ /* .reset = */ NULL,
1845
+ };
1846
+
1847
+ // default buffer type
1848
+
1849
+ static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
1850
+ auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1851
+ return ctx->name.c_str();
1852
+ }
1853
+
1854
+ static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1855
+ ggml_backend_kompute_device_ref(buft);
1856
+ auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
1857
+ return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
1858
+ }
1859
+
1860
+ static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1861
+ auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1862
+ return ctx->buffer_alignment;
1863
+ }
1864
+
1865
+ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
1866
+ auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1867
+ return ctx->max_alloc;
1868
+ }
1869
+
1870
+ static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
1871
+ GGML_UNUSED(buft);
1872
+ return ggml_backend_is_kompute(backend);
1873
+ }
1874
+
1875
+ static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
1876
+ /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
1877
+ /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
1878
+ /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
1879
+ /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
1880
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
1881
+ /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
1882
+ /* .is_host = */ NULL,
1883
+ };
1884
+
1885
+ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
1886
+ static std::vector<ggml_backend_buffer_type> bufts = []() {
1887
+ std::vector<ggml_backend_buffer_type> vec;
1888
+ auto devices = ggml_vk_available_devices_internal(0);
1889
+ vec.reserve(devices.size());
1890
+
1891
+ for (const auto & dev : devices) {
1892
+ vec.push_back({
1893
+ /* .iface = */ ggml_backend_kompute_buffer_type_interface,
1894
+ /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
1895
+ });
1896
+ }
1897
+ return vec;
1898
+ }();
1899
+
1900
+ auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
1901
+ return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
1902
+ });
1903
+ return it < bufts.end() ? &*it : nullptr;
1904
+ }
1905
+
1906
+ // backend
1907
+
1908
+ static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
1909
+ auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1910
+ return ctx->name.c_str();
1911
+ }
1912
+
1913
+ static void ggml_backend_kompute_free(ggml_backend_t backend) {
1914
+ auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1915
+
1916
+ assert(ctx == s_kompute_context);
1917
+ s_kompute_context = nullptr;
1918
+ if (ctx != nullptr) {
1919
+ delete ctx;
1920
+ }
1921
+
1922
+ delete backend;
1923
+ }
1924
+
1925
+ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
1926
+ auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1927
+ return ggml_backend_kompute_buffer_type(ctx->device);
1928
+ }
1929
+
1930
+ static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
1931
+ auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
1932
+ ggml_vk_graph_compute(ctx, cgraph);
1933
+ return true;
1934
+ }
1935
+
1936
+ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
1937
+ GGML_UNUSED(backend);
1938
+ return ggml_vk_supports_op(op);
1939
+ }
1940
+
1941
+ static struct ggml_backend_i kompute_backend_i = {
1942
+ /* .get_name = */ ggml_backend_kompute_name,
1943
+ /* .free = */ ggml_backend_kompute_free,
1944
+ /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
1945
+ /* .set_tensor_async = */ NULL,
1946
+ /* .get_tensor_async = */ NULL,
1947
+ /* .cpy_tensor_async = */ NULL,
1948
+ /* .synchronize = */ NULL,
1949
+ /* .graph_plan_create = */ NULL,
1950
+ /* .graph_plan_free = */ NULL,
1951
+ /* .graph_plan_compute = */ NULL,
1952
+ /* .graph_compute = */ ggml_backend_kompute_graph_compute,
1953
+ /* .supports_op = */ ggml_backend_kompute_supports_op,
1954
+ };
1955
+
1956
+ ggml_backend_t ggml_backend_kompute_init(int device) {
1957
+ GGML_ASSERT(s_kompute_context == nullptr);
1958
+ s_kompute_context = new ggml_kompute_context(device);
1959
+
1960
+ ggml_backend_t kompute_backend = new ggml_backend {
1961
+ /* .interface = */ kompute_backend_i,
1962
+ /* .context = */ s_kompute_context,
1963
+ };
1964
+
1965
+ return kompute_backend;
1966
+ }
1967
+
1968
+ bool ggml_backend_is_kompute(ggml_backend_t backend) {
1969
+ return backend && backend->iface.get_name == ggml_backend_kompute_name;
1970
+ }
1971
+
1972
+ static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
1973
+ GGML_UNUSED(params);
1974
+ return ggml_backend_kompute_init(intptr_t(user_data));
1975
+ }
1976
+
1977
+ extern "C" int ggml_backend_kompute_reg_devices();
1978
+
1979
+ int ggml_backend_kompute_reg_devices() {
1980
+ auto devices = ggml_vk_available_devices_internal(0);
1981
+ for (const auto & device : devices) {
1982
+ ggml_backend_register(
1983
+ ggml_kompute_format_name(device.index).c_str(),
1984
+ ggml_backend_reg_kompute_init,
1985
+ ggml_backend_kompute_buffer_type(device.index),
1986
+ reinterpret_cast<void *>(intptr_t(device.index))
1987
+ );
1988
+ }
1989
+ return devices.size();
1990
+ }
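For orientation, here is a minimal, hypothetical sketch of how an application could drive the new Kompute backend through the generic ggml-backend API declared in ggml-kompute.h below. It is not part of the commit, omits all error handling, and assumes the usual ggml helpers (ggml_init, ggml_new_graph, ggml_backend_graph_compute) plus ggml_backend_alloc_ctx_tensors from ggml-alloc.h:

// Hypothetical usage sketch of the Kompute backend (not part of this commit).
#include "ggml.h"
#include "ggml-alloc.h"   // assumed: ggml_backend_alloc_ctx_tensors
#include "ggml-backend.h"
#include "ggml-kompute.h"

#include <vector>

int main() {
    ggml_backend_t backend = ggml_backend_kompute_init(0); // device 0
    if (!backend) return 1;

    // build a tiny graph: c = mul_mat(a, b)
    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true, // tensor data will live in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    std::vector<float> data(16, 1.0f);
    ggml_backend_tensor_set(a, data.data(), 0, ggml_nbytes(a));
    ggml_backend_tensor_set(b, data.data(), 0, ggml_nbytes(b));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_backend_graph_compute(backend, gf); // dispatches into ggml_vk_graph_compute above

    std::vector<float> out(16);
    ggml_backend_tensor_get(c, out.data(), 0, ggml_nbytes(c));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}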
ggml-kompute.h ADDED
@@ -0,0 +1,46 @@
+ #pragma once
+
+ #include "ggml.h"
+ #include "ggml-backend.h"
+
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdint.h>
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ struct ggml_vk_device {
+ int index;
+ int type; // same as VkPhysicalDeviceType
+ size_t heapSize;
+ const char * name;
+ const char * vendor;
+ int subgroupSize;
+ uint64_t bufferAlignment;
+ uint64_t maxAlloc;
+ };
+
+ struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
+ bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
+ bool ggml_vk_has_vulkan(void);
+ bool ggml_vk_has_device(void);
+ struct ggml_vk_device ggml_vk_current_device(void);
+
+ //
+ // backend API
+ //
+
+ // forward declaration
+ typedef struct ggml_backend * ggml_backend_t;
+
+ GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
+
+ GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+
+ GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+
+ #ifdef __cplusplus
+ }
+ #endif
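The device-query part of this header can be used on its own, before any backend is created. A hypothetical sketch (not part of the commit) that relies only on the declarations above; note that the header does not specify who owns the returned array, so freeing it is omitted here:

// Hypothetical: list the Vulkan devices visible to the Kompute backend.
#include "ggml-kompute.h"
#include <stdio.h>

int main(void) {
    size_t count = 0;
    struct ggml_vk_device * devs = ggml_vk_available_devices(/* memoryRequired = */ 0, &count);
    for (size_t i = 0; i < count; ++i) {
        printf("device %d: %s (%s), heap %zu MiB, subgroup size %d\n",
               devs[i].index, devs[i].name, devs[i].vendor,
               devs[i].heapSize / (1024 * 1024), devs[i].subgroupSize);
    }
    if (count > 0) {
        struct ggml_vk_device chosen;
        if (ggml_vk_get_device(&chosen, /* memoryRequired = */ 0, devs[0].name)) {
            printf("selected: %s\n", chosen.name);
        }
    }
    return 0;
}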
ggml-sycl.cpp ADDED
The diff for this file is too large to render. See raw diff
 
ggml-sycl.h ADDED
@@ -0,0 +1,29 @@
+ //
+ // MIT license
+ // Copyright (C) 2024 Intel Corporation
+ // SPDX-License-Identifier: MIT
+ //
+
+ #pragma once
+
+ #include "ggml.h"
+ #include "ggml-backend.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #define GGML_SYCL_MAX_DEVICES 16
+ #define GGML_SYCL_NAME "SYCL"
+
+ GGML_API void ggml_init_sycl(void);
+ GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+ GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+ GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+ GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+ #ifdef __cplusplus
+ }
+ #endif
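A hypothetical sketch (not part of the commit) of how the device-listing entry points above might be used; it assumes unused slots in the id list remain -1, which is why the array is pre-filled defensively:

// Hypothetical: enumerate SYCL GPUs and create a backend.
#include "ggml-sycl.h"
#include <stdio.h>

int main(void) {
    int ids[GGML_SYCL_MAX_DEVICES];
    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) ids[i] = -1; // assumption: -1 marks an unused slot
    ggml_sycl_get_gpu_list(ids, GGML_SYCL_MAX_DEVICES);

    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
        if (ids[i] < 0) continue;
        char desc[256];
        ggml_sycl_get_device_description(ids[i], desc, sizeof(desc));
        printf("SYCL device %d: %s\n", ids[i], desc);
    }

    ggml_backend_t backend = ggml_backend_sycl_init(0); // device 0
    if (backend) {
        // ... build and compute a graph exactly as with any other ggml backend ...
        ggml_backend_free(backend);
    }
    return 0;
}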
ggml-vulkan.cpp ADDED
The diff for this file is too large to render. See raw diff
 
ggml-vulkan.h ADDED
@@ -0,0 +1,39 @@
+ #pragma once
+
+ #include "ggml.h"
+ #include "ggml-backend.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #define GGML_VK_NAME "Vulkan"
+ #define GGML_VK_MAX_DEVICES 16
+
+ GGML_API void ggml_vk_init_cpu_assist(void);
+
+ GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
+ GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
+ GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
+ GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+ #ifdef GGML_VULKAN_CHECK_RESULTS
+ void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+ #endif
+ GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
+ GGML_API void ggml_vk_free_cpu_assist(void);
+
+ // backend API
+ GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+
+ GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
+ GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
+ GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+ GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+ // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+
+ #ifdef __cplusplus
+ }
+ #endif
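A hypothetical sketch (not part of the commit) of the device-query API declared above; it only uses the functions from this header plus ggml_backend_free from ggml-backend.h:

// Hypothetical: query the devices exposed by the Vulkan backend and initialize one.
#include "ggml-vulkan.h"
#include <stdio.h>

int main(void) {
    const int n_dev = ggml_backend_vk_get_device_count();
    for (int i = 0; i < n_dev; ++i) {
        char desc[256];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
        ggml_backend_vk_get_device_memory(i, &free_mem, &total_mem);
        printf("Vulkan device %d: %s (%zu/%zu MiB free)\n",
               i, desc, free_mem / (1024 * 1024), total_mem / (1024 * 1024));
    }
    if (n_dev > 0) {
        ggml_backend_t backend = ggml_backend_vk_init(0); // device 0
        // ... use the backend via the generic ggml-backend API ...
        ggml_backend_free(backend);
    }
    return 0;
}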