ggerganov commited on
Commit
d321914
·
unverified ·
1 Parent(s): 844e617

ggml : remove old kompute, cann (skip) (#3349)

Browse files
Files changed (50) hide show
  1. ggml/include/ggml-kompute.h +0 -50
  2. ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  3. ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  4. ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  5. ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  6. ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  7. ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  8. ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  9. ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  10. ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  11. ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  12. ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  13. ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  14. ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  15. ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  16. ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  17. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  18. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  19. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  20. ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  21. ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  22. ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  23. ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  24. ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  25. ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  26. ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  27. ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  28. ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  29. ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  30. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  31. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  32. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  33. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  34. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  35. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  36. ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  37. ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  38. ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  39. ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  40. ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  41. ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  42. ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  43. ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  44. ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  45. ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  46. ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  47. ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  48. ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  49. ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  50. ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
ggml/include/ggml-kompute.h DELETED
@@ -1,50 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-backend.h"
5
-
6
- #include <stdbool.h>
7
- #include <stddef.h>
8
- #include <stdint.h>
9
-
10
- #ifdef __cplusplus
11
- extern "C" {
12
- #endif
13
-
14
- #define GGML_KOMPUTE_MAX_DEVICES 16
15
-
16
- struct ggml_vk_device {
17
- int index;
18
- int type; // same as VkPhysicalDeviceType
19
- size_t heapSize;
20
- const char * name;
21
- const char * vendor;
22
- int subgroupSize;
23
- uint64_t bufferAlignment;
24
- uint64_t maxAlloc;
25
- };
26
-
27
- struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
28
- bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
29
- bool ggml_vk_has_vulkan(void);
30
- bool ggml_vk_has_device(void);
31
- struct ggml_vk_device ggml_vk_current_device(void);
32
-
33
- //
34
- // backend API
35
- //
36
-
37
- // forward declaration
38
- typedef struct ggml_backend * ggml_backend_t;
39
-
40
- GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
41
-
42
- GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
43
-
44
- GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
45
-
46
- GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
47
-
48
- #ifdef __cplusplus
49
- }
50
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/CMakeLists.txt DELETED
@@ -1,30 +0,0 @@
1
- file(GLOB SRC_FILES
2
- get_row_f32.cpp
3
- get_row_f16.cpp
4
- get_row_q4_0.cpp
5
- get_row_q8_0.cpp
6
- quantize_f32_q8_0.cpp
7
- quantize_f16_q8_0.cpp
8
- quantize_float_to_q4_0.cpp
9
- dup.cpp
10
- )
11
-
12
- set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
13
- set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
14
-
15
- if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
16
- set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
17
- elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
18
- set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
19
- else()
20
- message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
21
- endif()
22
- include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
23
-
24
- ascendc_library(ascendc_kernels STATIC
25
- ${SRC_FILES}
26
- )
27
-
28
- message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
29
- ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
30
- # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/ascendc_kernels.h DELETED
@@ -1,19 +0,0 @@
1
- #ifndef ASCENDC_KERNELS_H
2
- #define ASCENDC_KERNELS_H
3
-
4
- #include "aclrtlaunch_ascendc_get_row_f32.h"
5
- #include "aclrtlaunch_ascendc_get_row_f16.h"
6
- #include "aclrtlaunch_ascendc_get_row_q8_0.h"
7
- #include "aclrtlaunch_ascendc_get_row_q4_0.h"
8
-
9
- #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
10
- #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
11
- #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
12
- #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
13
-
14
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
15
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
16
- #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
17
- #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
18
-
19
- #endif // ASCENDC_KERNELS_H
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/dup.cpp DELETED
@@ -1,234 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
-
5
- #define BUFFER_NUM 2
6
- const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
7
-
8
- template <typename SRC_T, typename DST_T>
9
- class DupByRows {
10
- public:
11
- __aicore__ inline DupByRows() {}
12
- __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
13
- size_t *input_nb_ub) {
14
- /* Dup by rows when src is contigous on first dimension and dst is
15
- contiguous, each kernel process one row.
16
- */
17
-
18
- // Input has four dims.
19
- int64_t op_block_num = GetBlockNum();
20
- int64_t op_block_idx = GetBlockIdx();
21
-
22
- // param
23
- num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
24
- num_elem = input_ne_ub[0];
25
-
26
- // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
27
- idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
28
- idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
29
- / (input_ne_ub[1]);
30
- idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
31
- - idx_ne2 * input_ne_ub[1];
32
-
33
- // src may not contiguous in dim [1,2,3], so stride decited by ne&nb
34
- src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
35
- + input_nb_ub[1] * idx_ne1;
36
-
37
- // dst is contiguous
38
- dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
39
-
40
- src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
41
- src_stride));
42
- dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
43
- dst_stride));
44
-
45
- pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
46
- 32 - 1) / 32 * 32);
47
- pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
48
- 32 - 1) / 32 * 32);
49
- }
50
-
51
- __aicore__ inline void copy_in() {
52
- LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
53
- const size_t elem_per_block = 32 / sizeof(SRC_T);
54
- size_t tail = num_elem % elem_per_block;
55
- size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
56
- DataCopy(src_local, src_gm, cpy_elements_len);
57
- src_queue.EnQue(src_local);
58
- }
59
-
60
- __aicore__ inline void copy_out() {
61
- LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
62
- #ifdef ASCEND_310P
63
- const size_t elem_per_block = 32 / sizeof(DST_T);
64
- size_t tail = num_elem % elem_per_block;
65
- size_t len = num_elem & ~(elem_per_block - 1);
66
- if (len > 0) {
67
- DataCopy(dst_gm, dst_local, len);
68
- }
69
- if(tail != 0) {
70
- for (size_t i = tail; i < elem_per_block; i++) {
71
- dst_local[len + i].SetValue(0, 0);
72
- }
73
- SetAtomicAdd<float>();
74
- DataCopy(dst_gm[len], dst_local[len], elem_per_block);
75
- SetAtomicNone();
76
- }
77
- #else
78
- DataCopyExtParams dataCopyParams;
79
- dataCopyParams.blockCount = 1;
80
- dataCopyParams.blockLen = num_elem * sizeof(DST_T);
81
- DataCopyPad(dst_gm, dst_local, dataCopyParams);
82
- #endif
83
- dst_queue.FreeTensor(dst_local);
84
- }
85
-
86
- __aicore__ inline void dup() {
87
- // main process, copy one row data from src to dst.
88
- copy_in();
89
-
90
- LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
91
- LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
92
-
93
- int32_t BLOCK_NUM = 32 / sizeof(DST_T);
94
- DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
95
- / BLOCK_NUM * BLOCK_NUM);
96
- dst_queue.EnQue<DST_T>(dst_local);
97
-
98
- src_queue.FreeTensor(src_local);
99
- copy_out();
100
- }
101
-
102
- __aicore__ inline void dup_with_cast() {
103
- // main process, copy one row data from src to dst.
104
- // cast dtype from src to dst.
105
- copy_in();
106
-
107
- LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
108
- LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
109
-
110
- Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
111
- dst_queue.EnQue<DST_T>(dst_local);
112
-
113
- src_queue.FreeTensor(src_local);
114
- copy_out();
115
- }
116
-
117
- private:
118
-
119
- TPipe pipe;
120
- GlobalTensor<SRC_T> src_gm;
121
- GlobalTensor<DST_T> dst_gm;
122
-
123
- int64_t num_rows;
124
- int64_t num_elem;
125
- int64_t idx_ne3;
126
- int64_t idx_ne2;
127
- int64_t idx_ne1;
128
- int64_t src_stride;
129
- int64_t dst_stride;
130
-
131
- TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
132
- TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
133
- };
134
-
135
- template <typename T>
136
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
137
- auto gm_ptr = (__gm__ uint8_t *)gm;
138
- auto ub_ptr = (uint8_t *)(ub);
139
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
140
- *ub_ptr = *gm_ptr;
141
- }
142
- }
143
-
144
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
145
- GM_ADDR src_gm,
146
- GM_ADDR dst_gm,
147
- GM_ADDR input_ne_gm,
148
- GM_ADDR input_nb_gm,
149
- GM_ADDR output_ne_gm,
150
- GM_ADDR output_nb_gm) {
151
-
152
- int64_t input_ne_ub[4];
153
- size_t input_nb_ub[4];
154
- int64_t output_ne_ub[4];
155
- size_t output_nb_ub[4];
156
-
157
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
158
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
159
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
160
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
161
-
162
- DupByRows<half, half> op;
163
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
164
- op.dup();
165
- }
166
-
167
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
168
- GM_ADDR src_gm,
169
- GM_ADDR dst_gm,
170
- GM_ADDR input_ne_gm,
171
- GM_ADDR input_nb_gm,
172
- GM_ADDR output_ne_gm,
173
- GM_ADDR output_nb_gm) {
174
- int64_t input_ne_ub[4];
175
- size_t input_nb_ub[4];
176
- int64_t output_ne_ub[4];
177
- size_t output_nb_ub[4];
178
-
179
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
180
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
181
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
182
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
183
-
184
- DupByRows<float, float> op;
185
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
186
- op.dup();
187
- }
188
-
189
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
190
- GM_ADDR src_gm,
191
- GM_ADDR dst_gm,
192
- GM_ADDR input_ne_gm,
193
- GM_ADDR input_nb_gm,
194
- GM_ADDR output_ne_gm,
195
- GM_ADDR output_nb_gm) {
196
-
197
- int64_t input_ne_ub[4];
198
- size_t input_nb_ub[4];
199
- int64_t output_ne_ub[4];
200
- size_t output_nb_ub[4];
201
-
202
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
203
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
204
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
205
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
206
-
207
- DupByRows<float, half> op;
208
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
209
- op.dup_with_cast();
210
- }
211
-
212
- extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
213
- GM_ADDR src_gm,
214
- GM_ADDR dst_gm,
215
- GM_ADDR input_ne_gm,
216
- GM_ADDR input_nb_gm,
217
- GM_ADDR output_ne_gm,
218
- GM_ADDR output_nb_gm) {
219
-
220
- // copy params from gm to ub.
221
- int64_t input_ne_ub[4];
222
- size_t input_nb_ub[4];
223
- int64_t output_ne_ub[4];
224
- size_t output_nb_ub[4];
225
-
226
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
227
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
228
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
229
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
230
-
231
- DupByRows<half, float> op;
232
- op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
233
- op.dup_with_cast();
234
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_f16.cpp DELETED
@@ -1,197 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- class GET_ROW_F16 {
9
- public:
10
- __aicore__ inline GET_ROW_F16() {}
11
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
12
- int64_t *input_ne_ub, size_t *input_nb_ub,
13
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
- int64_t *output_ne_ub, size_t *output_nb_ub) {
15
- // TODO, use template for F16/f32
16
- int64_t op_block_num = GetBlockNum();
17
- op_block_idx = GetBlockIdx();
18
-
19
- for (int i = 0; i < 4; i++) {
20
- input_ne[i] = input_ne_ub[i];
21
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
22
-
23
- indices_ne[i] = indices_ne_ub[i];
24
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
25
-
26
- output_ne[i] = output_ne_ub[i];
27
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
28
- }
29
-
30
- // Indices has two dims. n_elements = all rows should get.
31
- // dr, all rows should this thread get.
32
- uint64_t n_elements =
33
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
34
- dr = n_elements / op_block_num;
35
-
36
- uint64_t tails = n_elements % op_block_num;
37
- if (op_block_idx < tails) {
38
- dr += 1;
39
- ir = dr * op_block_idx;
40
- } else {
41
- ir = dr * op_block_idx + tails;
42
- }
43
-
44
- input_gm.SetGlobalBuffer((__gm__ half *)input);
45
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
46
- output_gm.SetGlobalBuffer((__gm__ float *)output);
47
-
48
- uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
49
- & ~31);
50
- uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
51
- & ~31);
52
-
53
- local_buffer_elems = input_local_buffer_size / sizeof(half);
54
-
55
- // TODO, consider long row that can't put in UB.
56
- // All data should asign to 32. It's ok because all data is align to 32.
57
- pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
58
- pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
59
- }
60
-
61
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
62
- size_t origin_len = len;
63
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
64
- const size_t elem_per_block = 32 / sizeof(half);
65
- size_t tail = len % elem_per_block;
66
- len = len & ~(elem_per_block - 1);
67
- if(tail != 0) {
68
- len += elem_per_block;
69
- }
70
- DataCopy(input_local, input_gm[offset], len);
71
- input_queue.EnQue(input_local);
72
- }
73
-
74
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
75
- LocalTensor<float> output_local = output_queue.DeQue<float>();
76
- const size_t elem_per_block = 32 / sizeof(float);
77
- size_t tail = len % elem_per_block;
78
- len = len & ~(elem_per_block - 1);
79
- if (len > 0) {
80
- DataCopy(output_gm[offset], output_local, len);
81
- }
82
-
83
- if(tail != 0) {
84
- #ifdef ASCEND_310P
85
- for (size_t i = tail; i < elem_per_block; i++) {
86
- output_local[len + i].SetValue(0, 0);
87
- }
88
- SetAtomicAdd<float>();
89
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
90
- SetAtomicNone();
91
- #else
92
- DataCopyExtParams dataCopyParams;
93
- dataCopyParams.blockCount = 1;
94
- dataCopyParams.blockLen = tail * sizeof(float);
95
- DataCopyPad(output_gm[offset + len], output_local[len],
96
- dataCopyParams);
97
- #endif
98
- }
99
- output_queue.FreeTensor(output_local);
100
- }
101
-
102
- __aicore__ inline void calculate_row(int64_t idx) {
103
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
104
- const int64_t indices_ne1_idx =
105
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
106
- indices_ne[0];
107
- const int64_t indices_ne0_idx =
108
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
109
- indices_ne1_idx * indices_ne[0]);
110
-
111
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
112
- indices_ne1_idx * indices_stride[1] +
113
- indices_ne2_idx * indices_stride[2];
114
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
115
-
116
- const int64_t input_offset = selected_row_idx * input_stride[1] +
117
- indices_ne1_idx * input_stride[2] +
118
- indices_ne2_idx * input_stride[3];
119
-
120
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
121
- indices_ne1_idx * output_stride[2] +
122
- indices_ne2_idx * output_stride[3];
123
-
124
- copy_in(input_offset, input_ne[0]);
125
- LocalTensor<half> input_local = input_queue.DeQue<half>();
126
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
127
-
128
- Cast(output_local, input_local, RoundMode::CAST_NONE,
129
- local_buffer_elems);
130
- output_queue.EnQue(output_local);
131
- copy_out(output_offset, input_ne[0]);
132
-
133
- input_queue.FreeTensor(input_local);
134
- }
135
-
136
- __aicore__ inline void calculate() {
137
- for (int64_t i = ir; i < ir + dr; i++) {
138
- calculate_row(i);
139
- }
140
- }
141
-
142
- private:
143
- int64_t input_ne[4];
144
- size_t input_stride[4];
145
-
146
- int64_t indices_ne[4];
147
- size_t indices_stride[4];
148
-
149
- int64_t output_ne[4];
150
- size_t output_stride[4];
151
-
152
- size_t local_buffer_elems;
153
-
154
- int64_t ir;
155
- int64_t dr;
156
-
157
- TPipe pipe;
158
- GlobalTensor<half> input_gm;
159
- GlobalTensor<int32_t> indices_gm;
160
- GlobalTensor<float> output_gm;
161
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
162
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
163
- int64_t op_block_idx;
164
- };
165
-
166
- template <typename T>
167
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
168
- auto gm_ptr = (__gm__ uint8_t *)gm;
169
- auto ub_ptr = (uint8_t *)(ub);
170
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
171
- *ub_ptr = *gm_ptr;
172
- }
173
- }
174
-
175
- extern "C" __global__ __aicore__ void ascendc_get_row_f16(
176
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
177
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
178
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
179
- int64_t input_ne_ub[4];
180
- size_t input_nb_ub[4];
181
- int64_t indices_ne_ub[4];
182
- size_t indices_nb_ub[4];
183
- int64_t output_ne_ub[4];
184
- size_t output_nb_ub[4];
185
-
186
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
187
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
188
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
189
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
190
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
191
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
192
-
193
- GET_ROW_F16 op;
194
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
195
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
196
- op.calculate();
197
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_f32.cpp DELETED
@@ -1,190 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- class GET_ROW_F32 {
9
- public:
10
- __aicore__ inline GET_ROW_F32() {}
11
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
12
- int64_t *input_ne_ub, size_t *input_nb_ub,
13
- int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
- int64_t *output_ne_ub, size_t *output_nb_ub) {
15
- int64_t op_block_num = GetBlockNum();
16
- op_block_idx = GetBlockIdx();
17
-
18
- for (int i = 0; i < 4; i++) {
19
- input_ne[i] = input_ne_ub[i];
20
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
21
-
22
- indices_ne[i] = indices_ne_ub[i];
23
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
24
-
25
- output_ne[i] = output_ne_ub[i];
26
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
27
- }
28
-
29
- // Indices has two dims. n_elements = all rows should get.
30
- // dr, all rows should this thread get.
31
- uint64_t n_elements =
32
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
33
- dr = n_elements / op_block_num;
34
-
35
- uint64_t tails = n_elements % op_block_num;
36
- if (op_block_idx < tails) {
37
- dr += 1;
38
- ir = dr * op_block_idx;
39
- } else {
40
- ir = dr * op_block_idx + tails;
41
- }
42
-
43
- input_gm.SetGlobalBuffer((__gm__ float *)input);
44
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
45
- output_gm.SetGlobalBuffer((__gm__ float *)output);
46
-
47
- uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
48
- local_buffer_elems = local_buffer_size / sizeof(float);
49
-
50
- // TODO, consider long row that can't put in UB.
51
- // All data should asign to 32. It's ok because all data is align to 32.
52
- pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
53
- pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
54
- }
55
-
56
- __aicore__ inline void copy_in(uint32_t offset, size_t len) {
57
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
58
- const size_t elem_per_block = 32 / sizeof(float);
59
- size_t tail = len % elem_per_block;
60
- len = len & ~(elem_per_block - 1);
61
- if(tail != 0) {
62
- len += elem_per_block;
63
- }
64
- DataCopy(input_local, input_gm[offset], len);
65
- input_queue.EnQue(input_local);
66
- }
67
-
68
- __aicore__ inline void copy_out(uint32_t offset, size_t len) {
69
- LocalTensor<float> output_local = output_queue.DeQue<float>();
70
- const size_t elem_per_block = 32 / sizeof(float);
71
- size_t tail = len % elem_per_block;
72
- len = len & ~(elem_per_block - 1);
73
- if (len > 0) {
74
- DataCopy(output_gm[offset], output_local, len);
75
- }
76
-
77
- if(tail != 0) {
78
- #ifdef ASCEND_310P
79
- for (size_t i = tail; i < elem_per_block; i++) {
80
- output_local[len + i].SetValue(0, 0);
81
- }
82
- SetAtomicAdd<float>();
83
- DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
84
- SetAtomicNone();
85
- #else
86
- DataCopyExtParams dataCopyParams;
87
- dataCopyParams.blockCount = 1;
88
- dataCopyParams.blockLen = tail * sizeof(float);
89
- DataCopyPad(output_gm[offset + len], output_local[len],
90
- dataCopyParams);
91
- #endif
92
- }
93
- output_queue.FreeTensor(output_local);
94
- }
95
-
96
- __aicore__ inline void calculate_row(int64_t idx) {
97
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
98
- const int64_t indices_ne1_idx =
99
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
100
- indices_ne[0];
101
- const int64_t indices_ne0_idx =
102
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
103
- indices_ne1_idx * indices_ne[0]);
104
-
105
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
106
- indices_ne1_idx * indices_stride[1] +
107
- indices_ne2_idx * indices_stride[2];
108
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
109
-
110
- const int64_t input_offset = selected_row_idx * input_stride[1] +
111
- indices_ne1_idx * input_stride[2] +
112
- indices_ne2_idx * input_stride[3];
113
-
114
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
115
- indices_ne1_idx * output_stride[2] +
116
- indices_ne2_idx * output_stride[3];
117
-
118
- copy_in(input_offset, input_ne[0]);
119
- LocalTensor<float> input_local = input_queue.DeQue<float>();
120
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
121
-
122
- DataCopy(output_local, input_local, local_buffer_elems);
123
- output_queue.EnQue(output_local);
124
- copy_out(output_offset, input_ne[0]);
125
-
126
- input_queue.FreeTensor(input_local);
127
- }
128
-
129
- __aicore__ inline void calculate() {
130
- for (int64_t i = ir; i < ir + dr; i++) {
131
- calculate_row(i);
132
- }
133
- }
134
-
135
- private:
136
- int64_t input_ne[4];
137
- size_t input_stride[4];
138
-
139
- int64_t indices_ne[4];
140
- size_t indices_stride[4];
141
-
142
- int64_t output_ne[4];
143
- size_t output_stride[4];
144
-
145
- size_t local_buffer_elems;
146
-
147
- int64_t ir;
148
- int64_t dr;
149
-
150
- TPipe pipe;
151
- GlobalTensor<float> input_gm;
152
- GlobalTensor<int32_t> indices_gm;
153
- GlobalTensor<float> output_gm;
154
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
155
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
156
- int64_t op_block_idx;
157
- };
158
-
159
- template <typename T>
160
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
161
- auto gm_ptr = (__gm__ uint8_t *)gm;
162
- auto ub_ptr = (uint8_t *)(ub);
163
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
164
- *ub_ptr = *gm_ptr;
165
- }
166
- }
167
-
168
- extern "C" __global__ __aicore__ void ascendc_get_row_f32(
169
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
170
- GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
171
- GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
172
- int64_t input_ne_ub[4];
173
- size_t input_nb_ub[4];
174
- int64_t indices_ne_ub[4];
175
- size_t indices_nb_ub[4];
176
- int64_t output_ne_ub[4];
177
- size_t output_nb_ub[4];
178
-
179
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
180
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
181
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
182
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
183
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
184
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
185
-
186
- GET_ROW_F32 op;
187
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
188
- indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
189
- op.calculate();
190
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp DELETED
@@ -1,204 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
- #ifdef ASCEND_310P // 310P not support 4bit get row
6
- extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
7
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
8
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
9
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
10
- // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
11
- printf("Ascend310P not support 4bit get row.\n");
12
- }
13
- #else
14
-
15
- #define BUFFER_NUM 2
16
-
17
- #define QK4_0 32
18
-
19
- class GET_ROW_Q4_0 {
20
- public:
21
- __aicore__ inline GET_ROW_Q4_0() {}
22
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
23
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
24
- size_t *indices_nb_ub, int64_t *output_ne_ub,
25
- size_t *output_nb_ub) {
26
- int64_t op_block_num = GetBlockNum();
27
- int64_t op_block_idx = GetBlockIdx();
28
-
29
- for (int i = 0; i < 4; i++) {
30
- input_ne[i] = input_ne_ub[i];
31
- indices_ne[i] = indices_ne_ub[i];
32
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
33
- scale_ne[i] = input_ne_ub[i];
34
- output_ne[i] = output_ne_ub[i];
35
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
36
- }
37
-
38
- // one scale for a group.
39
- scale_ne[0] /= QK4_0;
40
-
41
- input_stride[0] = 1;
42
- scale_stride[0] = 1;
43
- output_stride[0] = 1;
44
- for (int i = 1; i < 4; i++) {
45
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
46
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
47
- }
48
-
49
- group_size_in_row = input_ne[0] / QK4_0;
50
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
51
- input_ne[3] / 2;
52
-
53
- // Indices has two dims. n_elements = all rows should get.
54
- // dr, all rows should this thread get.
55
- uint64_t n_elements =
56
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
57
- dr = n_elements / op_block_num;
58
-
59
- uint64_t tails = n_elements % op_block_num;
60
- if (op_block_idx < tails) {
61
- dr += 1;
62
- ir = dr * op_block_idx;
63
- } else {
64
- ir = dr * op_block_idx + tails;
65
- }
66
-
67
- input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
68
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
69
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
70
- output_gm.SetGlobalBuffer((__gm__ float *)output);
71
-
72
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
73
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
74
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
75
- }
76
-
77
- __aicore__ inline void copy_in(uint32_t offset) {
78
- LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
79
- // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
80
- DataCopy(input_local, input_gm[offset], QK4_0);
81
- input_queue.EnQue(input_local);
82
- }
83
-
84
- __aicore__ inline void copy_out(uint32_t offset) {
85
- LocalTensor<float> output_local = output_queue.DeQue<float>();
86
- DataCopy(output_gm[offset], output_local, QK4_0);
87
- output_queue.FreeTensor(output_local);
88
- }
89
-
90
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
91
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
92
- const int64_t indices_ne1_idx =
93
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
94
- indices_ne[0];
95
- const int64_t indices_ne0_idx =
96
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
97
- indices_ne1_idx * indices_ne[0]);
98
-
99
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
100
- indices_ne1_idx * indices_stride[1] +
101
- indices_ne2_idx * indices_stride[2];
102
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
103
-
104
- const int64_t input_offset = selected_row_idx * input_stride[1] +
105
- indices_ne1_idx * input_stride[2] +
106
- indices_ne2_idx * input_stride[3] +
107
- group * QK4_0;
108
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
109
- indices_ne1_idx * scale_stride[2] +
110
- indices_ne2_idx * scale_stride[3] + group;
111
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
112
- indices_ne1_idx * output_stride[2] +
113
- indices_ne2_idx * output_stride[3] +
114
- group * QK4_0;
115
-
116
- copy_in(input_offset);
117
- LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
118
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
119
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
120
-
121
- // TODO: cast more data to speed up.
122
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
123
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
124
-
125
- // Only mul need compile by group.
126
- half scale = scale_gm.GetValue(scale_offset);
127
-
128
- Muls(output_local, output_local, (float)scale, QK4_0);
129
-
130
- input_queue.FreeTensor(input_local);
131
- cast_queue.FreeTensor(cast_local);
132
- output_queue.EnQue(output_local);
133
-
134
- copy_out(output_offset);
135
- }
136
-
137
- __aicore__ inline void calculate() {
138
- for (int64_t i = ir; i < ir + dr; i++) {
139
- for (int64_t j = 0; j < group_size_in_row; j++) {
140
- calculate_group(i, j);
141
- }
142
- }
143
- }
144
-
145
- private:
146
- int64_t input_ne[4];
147
- size_t input_stride[4];
148
-
149
- int64_t scale_ne[4];
150
- size_t scale_stride[4];
151
-
152
- int64_t indices_ne[4];
153
- size_t indices_stride[4];
154
-
155
- int64_t output_ne[4];
156
- size_t output_stride[4];
157
-
158
- int64_t ir;
159
- int64_t dr;
160
-
161
- int64_t group_size_in_row;
162
-
163
- TPipe pipe;
164
- GlobalTensor<int4b_t> input_gm;
165
- GlobalTensor<half> scale_gm;
166
- GlobalTensor<int32_t> indices_gm;
167
- GlobalTensor<float> output_gm;
168
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
169
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
170
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
171
- };
172
-
173
- template <typename T>
174
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
175
- auto gm_ptr = (__gm__ uint8_t *)gm;
176
- auto ub_ptr = (uint8_t *)(ub);
177
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
178
- *ub_ptr = *gm_ptr;
179
- }
180
- }
181
-
182
- extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
183
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
184
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
185
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
186
- int64_t input_ne_ub[4];
187
- int64_t indices_ne_ub[4];
188
- size_t indices_nb_ub[4];
189
- int64_t output_ne_ub[4];
190
- size_t output_nb_ub[4];
191
-
192
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
193
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
194
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
195
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
196
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
197
-
198
- GET_ROW_Q4_0 op;
199
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
200
- indices_nb_ub, output_ne_ub, output_nb_ub);
201
- op.calculate();
202
- }
203
-
204
- #endif // #ifdef ASCEND_310P
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/get_row_q8_0.cpp DELETED
@@ -1,191 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- // optimize me. Use template to avoid copy code.
4
- using namespace AscendC;
5
-
6
- #define BUFFER_NUM 2
7
-
8
- #define QK8_0 32
9
-
10
- class GET_ROW_Q8_0 {
11
- public:
12
- __aicore__ inline GET_ROW_Q8_0() {}
13
- __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
14
- int64_t *input_ne_ub, int64_t *indices_ne_ub,
15
- size_t *indices_nb_ub, int64_t *output_ne_ub,
16
- size_t *output_nb_ub) {
17
- int64_t op_block_num = GetBlockNum();
18
- int64_t op_block_idx = GetBlockIdx();
19
-
20
- for (int i = 0; i < 4; i++) {
21
- input_ne[i] = input_ne_ub[i];
22
- indices_ne[i] = indices_ne_ub[i];
23
- indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
24
- scale_ne[i] = input_ne_ub[i];
25
- output_ne[i] = output_ne_ub[i];
26
- output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
27
- }
28
-
29
- // one scale for a group.
30
- scale_ne[0] /= QK8_0;
31
-
32
- input_stride[0] = 1;
33
- scale_stride[0] = 1;
34
- output_stride[0] = 1;
35
- for (int i = 1; i < 4; i++) {
36
- input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
37
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
38
- }
39
-
40
- group_size_in_row = input_ne[0] / QK8_0;
41
- int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
42
- input_ne[3] * sizeof(int8_t);
43
-
44
- // Indices has two dims. n_elements = all rows should get.
45
- // dr, all rows should this thread get.
46
- uint64_t n_elements =
47
- indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
48
- dr = n_elements / op_block_num;
49
-
50
- uint64_t tails = n_elements % op_block_num;
51
- if (op_block_idx < tails) {
52
- dr += 1;
53
- ir = dr * op_block_idx;
54
- } else {
55
- ir = dr * op_block_idx + tails;
56
- }
57
-
58
- input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
59
- scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
60
- indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
61
- output_gm.SetGlobalBuffer((__gm__ float *)output);
62
-
63
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
64
- pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
65
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
66
- }
67
-
68
- __aicore__ inline void copy_in(uint32_t offset) {
69
- LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
70
- DataCopy(input_local, input_gm[offset], QK8_0);
71
- input_queue.EnQue(input_local);
72
- }
73
-
74
- __aicore__ inline void copy_out(uint32_t offset) {
75
- LocalTensor<float> output_local = output_queue.DeQue<float>();
76
- DataCopy(output_gm[offset], output_local, QK8_0);
77
- output_queue.FreeTensor(output_local);
78
- }
79
-
80
- __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
81
- const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
82
- const int64_t indices_ne1_idx =
83
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
84
- indices_ne[0];
85
- const int64_t indices_ne0_idx =
86
- (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
87
- indices_ne1_idx * indices_ne[0]);
88
-
89
- const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
90
- indices_ne1_idx * indices_stride[1] +
91
- indices_ne2_idx * indices_stride[2];
92
- const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
93
-
94
- const int64_t input_offset = selected_row_idx * input_stride[1] +
95
- indices_ne1_idx * input_stride[2] +
96
- indices_ne2_idx * input_stride[3] +
97
- group * QK8_0;
98
- const int64_t scale_offset = selected_row_idx * scale_stride[1] +
99
- indices_ne1_idx * scale_stride[2] +
100
- indices_ne2_idx * scale_stride[3] + group;
101
- const int64_t output_offset = indices_ne0_idx * output_stride[1] +
102
- indices_ne1_idx * output_stride[2] +
103
- indices_ne2_idx * output_stride[3] +
104
- group * QK8_0;
105
-
106
- copy_in(input_offset);
107
- LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
108
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
109
- LocalTensor<float> output_local = output_queue.AllocTensor<float>();
110
-
111
- // TODO: cast more data to speed up.
112
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
113
- Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
114
-
115
- // Only mul need compile by group.
116
- half scale = scale_gm.GetValue(scale_offset);
117
- Muls(output_local, output_local, (float)scale, QK8_0);
118
-
119
- input_queue.FreeTensor(input_local);
120
- cast_queue.FreeTensor(cast_local);
121
- output_queue.EnQue(output_local);
122
-
123
- copy_out(output_offset);
124
- }
125
-
126
- __aicore__ inline void calculate() {
127
- for (int64_t i = ir; i < ir + dr; i++) {
128
- for (int64_t j = 0; j < group_size_in_row; j++) {
129
- calculate_group(i, j);
130
- }
131
- }
132
- }
133
-
134
- private:
135
- int64_t input_ne[4];
136
- size_t input_stride[4];
137
-
138
- int64_t scale_ne[4];
139
- size_t scale_stride[4];
140
-
141
- int64_t indices_ne[4];
142
- size_t indices_stride[4];
143
-
144
- int64_t output_ne[4];
145
- size_t output_stride[4];
146
-
147
- int64_t ir;
148
- int64_t dr;
149
-
150
- int64_t group_size_in_row;
151
-
152
- TPipe pipe;
153
- GlobalTensor<int8_t> input_gm;
154
- GlobalTensor<half> scale_gm;
155
- GlobalTensor<int32_t> indices_gm;
156
- GlobalTensor<float> output_gm;
157
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
158
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
159
- TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
160
- };
161
-
162
- template <typename T>
163
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
164
- auto gm_ptr = (__gm__ uint8_t *)gm;
165
- auto ub_ptr = (uint8_t *)(ub);
166
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
167
- *ub_ptr = *gm_ptr;
168
- }
169
- }
170
-
171
- extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
172
- GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
173
- GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
174
- GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
175
- int64_t input_ne_ub[4];
176
- int64_t indices_ne_ub[4];
177
- size_t indices_nb_ub[4];
178
- int64_t output_ne_ub[4];
179
- size_t output_nb_ub[4];
180
-
181
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
182
- copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
183
- copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
184
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
185
- copy_to_ub(output_nb_gm, output_nb_ub, 32);
186
-
187
- GET_ROW_Q8_0 op;
188
- op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
189
- indices_nb_ub, output_ne_ub, output_nb_ub);
190
- op.calculate();
191
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp DELETED
@@ -1,218 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
9
- printf("Ascend310P not support f16->8bit quantization.\n");
10
- }
11
- #else
12
-
13
- #define BUFFER_NUM 2
14
- #define QK8_0 32
15
-
16
- class QUANTIZE_F16_Q8_0 {
17
- public:
18
- __aicore__ inline QUANTIZE_F16_Q8_0() {}
19
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
20
- int64_t *input_ne_ub, size_t *input_nb_ub,
21
- int64_t *output_ne_ub) {
22
- int64_t op_block_num = GetBlockNum();
23
- int64_t op_block_idx = GetBlockIdx();
24
-
25
- for (int i = 0; i < 4; i++) {
26
- input_ne[i] = input_ne_ub[i];
27
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
28
-
29
- output_ne[i] = output_ne_ub[i];
30
- }
31
-
32
- output_stride[0] = 1;
33
- for (int i = 1; i < 4; i++) {
34
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
35
- }
36
-
37
- scale_ne = input_ne;
38
- scale_stride[0] = 1;
39
- scale_stride[1] = input_ne[0] / QK8_0;
40
- for (int i = 2; i < 4; i++) {
41
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
42
- }
43
-
44
- // split input tensor by rows.
45
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
46
- dr = nr / op_block_num;
47
-
48
- uint64_t tails = nr % op_block_num;
49
- if (op_block_idx < tails) {
50
- dr += 1;
51
- ir = dr * op_block_idx;
52
- } else {
53
- ir = dr * op_block_idx + tails;
54
- }
55
-
56
- group_size_in_row = scale_stride[1];
57
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
58
- output_ne[3] * sizeof(uint8_t);
59
-
60
- input_gm.SetGlobalBuffer((__gm__ half *)input);
61
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
62
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
63
- group_size_in_row *
64
- sizeof(half)));
65
-
66
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
67
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
68
- pipe.InitBuffer(work_queue, 1, 32);
69
- pipe.InitBuffer(max_queue, 1, 32);
70
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
71
- pipe.InitBuffer(scale_queue, 1, 32);
72
- pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
73
- }
74
-
75
- __aicore__ inline void copy_in(uint32_t offset) {
76
- LocalTensor<half> input_local = input_queue.AllocTensor<half>();
77
- DataCopy(input_local, input_gm[offset], QK8_0);
78
- input_queue.EnQue(input_local);
79
- }
80
-
81
- __aicore__ inline void copy_out(uint32_t offset) {
82
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
83
- DataCopy(output_gm[offset], output_local, QK8_0);
84
- output_queue.FreeTensor(output_local);
85
- }
86
-
87
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
88
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
89
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
90
- const int64_t i1 =
91
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
92
-
93
- const int64_t input_offset = i1 * input_stride[1] +
94
- i2 * input_stride[2] +
95
- i3 * input_stride[3] + QK8_0 * group;
96
-
97
- const int64_t output_offset = i1 * output_stride[1] +
98
- i2 * output_stride[2] +
99
- i3 * output_stride[3] + QK8_0 * group;
100
-
101
- copy_in(input_offset);
102
- LocalTensor<half> input_local = input_queue.DeQue<half>();
103
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
104
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
105
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
106
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
107
- LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
108
-
109
- Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
110
- Abs(abs_local, cast_local, QK8_0);
111
- ReduceMax(max_local, abs_local, work_local, QK8_0);
112
-
113
- pipe_barrier(PIPE_ALL);
114
- float d = max_local.GetValue(0);
115
- d = d / ((1 << 7) - 1);
116
- if (d != 0) {
117
- Muls(cast_local, cast_local, 1.0f / d, QK8_0);
118
- }
119
-
120
- Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
121
- Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
122
- Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
123
- output_queue.EnQue(output_local);
124
- copy_out(output_offset);
125
-
126
- input_queue.FreeTensor(input_local);
127
- work_queue.FreeTensor(work_local);
128
- abs_queue.FreeTensor(abs_local);
129
- max_queue.FreeTensor(max_local);
130
- cast_queue.FreeTensor(cast_local);
131
- return (half)d;
132
- }
133
-
134
- __aicore__ inline void calculate() {
135
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
136
- uint32_t scale_local_offset = 0;
137
- uint32_t scale_global_offset = 0;
138
- for (int64_t i = ir; i < ir + dr; i++) {
139
- for (int64_t j = 0; j < group_size_in_row; j++) {
140
- half scale = calculate_group(i, j);
141
- scale_local.SetValue(scale_local_offset++, scale);
142
- if (scale_local_offset == 16) {
143
- scale_local_offset = 0;
144
- // TODO: OPTIMIZE ME
145
- pipe_barrier(PIPE_ALL);
146
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
147
- pipe_barrier(PIPE_ALL);
148
- scale_global_offset += 16;
149
- }
150
- }
151
- }
152
-
153
- if (scale_local_offset != 0) {
154
- pipe_barrier(PIPE_ALL);
155
- DataCopyExtParams dataCopyParams;
156
- dataCopyParams.blockCount = 1;
157
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
158
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
159
- dataCopyParams);
160
- pipe_barrier(PIPE_ALL);
161
- }
162
- }
163
-
164
- private:
165
- int64_t input_ne[4];
166
- size_t input_stride[4];
167
-
168
- int64_t *scale_ne;
169
- size_t scale_stride[4];
170
-
171
- int64_t output_ne[4];
172
- size_t output_stride[4];
173
-
174
- int64_t group_size_in_row;
175
-
176
- int64_t ir;
177
- int64_t dr;
178
-
179
- TPipe pipe;
180
- GlobalTensor<half> input_gm;
181
- GlobalTensor<half> scale_gm;
182
- GlobalTensor<int8_t> output_gm;
183
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
184
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
185
- TQue<QuePosition::VECIN, 1> work_queue;
186
- TQue<QuePosition::VECOUT, 1> max_queue;
187
- TQue<QuePosition::VECIN, 1> abs_queue;
188
- TQue<QuePosition::VECOUT, 1> scale_queue;
189
- TQue<QuePosition::VECOUT, 1> cast_queue;
190
-
191
- };
192
-
193
- template <typename T>
194
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
195
- auto gm_ptr = (__gm__ uint8_t *)gm;
196
- auto ub_ptr = (uint8_t *)(ub);
197
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
198
- *ub_ptr = *gm_ptr;
199
- }
200
- }
201
-
202
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
203
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
204
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
205
- int64_t input_ne_ub[4];
206
- size_t input_nb_ub[4];
207
- int64_t output_ne_ub[4];
208
-
209
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
210
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
211
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
212
-
213
- QUANTIZE_F16_Q8_0 op;
214
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
215
- op.calculate();
216
- }
217
-
218
- #endif // #ifdef ASCEND_310P
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp DELETED
@@ -1,216 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P // 310P not support f32->8bit quantization
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
9
- printf("Ascend310P not support f32->8bit quantization.\n");
10
- }
11
- #else
12
-
13
- #define BUFFER_NUM 2
14
- #define QK8_0 32
15
-
16
- class QUANTIZE_F32_Q8_0 {
17
- public:
18
- __aicore__ inline QUANTIZE_F32_Q8_0() {}
19
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
20
- int64_t *input_ne_ub, size_t *input_nb_ub,
21
- int64_t *output_ne_ub) {
22
- int64_t op_block_num = GetBlockNum();
23
- int64_t op_block_idx = GetBlockIdx();
24
-
25
- for (int i = 0; i < 4; i++) {
26
- input_ne[i] = input_ne_ub[i];
27
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
28
-
29
- output_ne[i] = output_ne_ub[i];
30
- }
31
-
32
- output_stride[0] = 1;
33
- for (int i = 1; i < 4; i++) {
34
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
35
- }
36
-
37
- scale_ne = input_ne;
38
- scale_stride[0] = 1;
39
- scale_stride[1] = input_ne[0] / QK8_0;
40
- for (int i = 2; i < 4; i++) {
41
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
42
- }
43
-
44
- // split input tensor by rows.
45
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
46
- dr = nr / op_block_num;
47
-
48
- uint64_t tails = nr % op_block_num;
49
- if (op_block_idx < tails) {
50
- dr += 1;
51
- ir = dr * op_block_idx;
52
- } else {
53
- ir = dr * op_block_idx + tails;
54
- }
55
-
56
- group_size_in_row = scale_stride[1];
57
- int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
58
- output_ne[3] * sizeof(uint8_t);
59
-
60
- input_gm.SetGlobalBuffer((__gm__ float *)input);
61
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
62
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
63
- ir * group_size_in_row *
64
- sizeof(half)));
65
-
66
- pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
67
- pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
68
- pipe.InitBuffer(work_queue, 1, 32);
69
- pipe.InitBuffer(max_queue, 1, 32);
70
- pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
71
- pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
72
- pipe.InitBuffer(scale_queue, 1, 32);
73
- }
74
-
75
- __aicore__ inline void copy_in(uint32_t offset) {
76
- LocalTensor<float> input_local = input_queue.AllocTensor<float>();
77
- DataCopy(input_local, input_gm[offset], QK8_0);
78
- input_queue.EnQue(input_local);
79
- }
80
-
81
- __aicore__ inline void copy_out(uint32_t offset) {
82
- LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
83
- DataCopy(output_gm[offset], output_local, QK8_0);
84
- output_queue.FreeTensor(output_local);
85
- }
86
-
87
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
88
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
89
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
90
- const int64_t i1 =
91
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
92
-
93
- const int64_t input_offset = i1 * input_stride[1] +
94
- i2 * input_stride[2] +
95
- i3 * input_stride[3] + QK8_0 * group;
96
-
97
- const int64_t output_offset = i1 * output_stride[1] +
98
- i2 * output_stride[2] +
99
- i3 * output_stride[3] + QK8_0 * group;
100
-
101
- copy_in(input_offset);
102
- LocalTensor<float> input_local = input_queue.DeQue<float>();
103
- LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
104
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
105
- LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
106
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
107
- LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
108
-
109
- Abs(abs_local, input_local, QK8_0);
110
- ReduceMax(max_local, abs_local, work_local, QK8_0);
111
- pipe_barrier(PIPE_ALL);
112
- float d = max_local.GetValue(0);
113
- d = d / ((1 << 7) - 1);
114
- if (d != 0) {
115
- Muls(input_local, input_local, 1.0f / d, QK8_0);
116
- }
117
-
118
- Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
119
- Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
120
- Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
121
- output_queue.EnQue(output_local);
122
- copy_out(output_offset);
123
-
124
- input_queue.FreeTensor(input_local);
125
- work_queue.FreeTensor(work_local);
126
- abs_queue.FreeTensor(abs_local);
127
- max_queue.FreeTensor(max_local);
128
- cast_queue.FreeTensor(cast_local);
129
-
130
- return (half)d;
131
- }
132
-
133
- __aicore__ inline void calculate() {
134
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
135
- uint32_t scale_local_offset = 0;
136
- uint32_t scale_global_offset = 0;
137
- for (int64_t i = ir; i < ir + dr; i++) {
138
- for (int64_t j = 0; j < group_size_in_row; j++) {
139
- half scale = calculate_group(i, j);
140
- scale_local.SetValue(scale_local_offset++, scale);
141
- if (scale_local_offset == 16) {
142
- scale_local_offset = 0;
143
- // TODO: OPTIMIZE ME
144
- pipe_barrier(PIPE_ALL);
145
- DataCopy(scale_gm[scale_global_offset], scale_local, 16);
146
- pipe_barrier(PIPE_ALL);
147
- scale_global_offset += 16;
148
- }
149
- }
150
- }
151
-
152
- if (scale_local_offset != 0) {
153
- pipe_barrier(PIPE_ALL);
154
- DataCopyExtParams dataCopyParams;
155
- dataCopyParams.blockCount = 1;
156
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
157
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
158
- dataCopyParams);
159
- pipe_barrier(PIPE_ALL);
160
- }
161
- }
162
-
163
- private:
164
- int64_t input_ne[4];
165
- size_t input_stride[4];
166
-
167
- int64_t *scale_ne;
168
- size_t scale_stride[4];
169
-
170
- int64_t output_ne[4];
171
- size_t output_stride[4];
172
-
173
- int64_t group_size_in_row;
174
-
175
- int64_t ir;
176
- int64_t dr;
177
-
178
- TPipe pipe;
179
- GlobalTensor<float> input_gm;
180
- GlobalTensor<half> scale_gm;
181
- GlobalTensor<int8_t> output_gm;
182
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
183
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
184
- TQue<QuePosition::VECIN, 1> work_queue;
185
- TQue<QuePosition::VECOUT, 1> max_queue;
186
- TQue<QuePosition::VECIN, 1> abs_queue;
187
- TQue<QuePosition::VECIN, 1> cast_queue;
188
- TQue<QuePosition::VECOUT, 1> scale_queue;
189
- };
190
-
191
- template <typename T>
192
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
193
- auto gm_ptr = (__gm__ uint8_t *)gm;
194
- auto ub_ptr = (uint8_t *)(ub);
195
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
196
- *ub_ptr = *gm_ptr;
197
- }
198
- }
199
-
200
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
201
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
202
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
203
- int64_t input_ne_ub[4];
204
- size_t input_nb_ub[4];
205
- int64_t output_ne_ub[4];
206
-
207
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
208
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
209
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
210
-
211
- QUANTIZE_F32_Q8_0 op;
212
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
213
- op.calculate();
214
- }
215
-
216
- #endif // #ifdef ASCEND_310P
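
For context, the kernel above maps each group of QK8_0 = 32 floats to int8 with one per-group scale: d = max(|x|) / 127 and q_i = round(x_i / d), with the fp16 scales stored behind the int8 data. A rough CPU equivalent of that math (the function name and the float scale array are illustrative only; the kernel writes the scales as half directly into scale_gm):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // CPU sketch of the per-group Q8_0 math used above:
    // d = max(|x|) / 127, q_i = round(x_i / d), one scale per 32 values.
    static void quantize_q8_0_ref(const float *x, int8_t *q, float *scales, size_t n) {
        const size_t qk = 32;  // QK8_0
        for (size_t g = 0; g < n / qk; ++g) {
            float amax = 0.0f;
            for (size_t i = 0; i < qk; ++i) {
                amax = std::fmax(amax, std::fabs(x[g*qk + i]));
            }
            const float d  = amax / 127.0f;                  // (1 << 7) - 1
            const float id = d != 0.0f ? 1.0f / d : 0.0f;
            for (size_t i = 0; i < qk; ++i) {
                q[g*qk + i] = (int8_t) std::lround(x[g*qk + i] * id);
            }
            scales[g] = d;  // the kernel stores this as half behind the int8 data
        }
    }
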
 
ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp DELETED
@@ -1,295 +0,0 @@
1
- #include "kernel_operator.h"
2
-
3
- using namespace AscendC;
4
- #ifdef ASCEND_310P // 310P does not support float->4bit quantization
5
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
6
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
7
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
8
- // let the following test cases continue to run; just print an error message here. Of course, any test case that calls this operator will fail.
9
- printf("Ascend310P not support f32->4bit quantization.\n");
10
- }
11
-
12
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
13
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
14
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
15
- // let the following test cases continue to run; just print an error message here. Of course, any test case that calls this operator will fail.
16
- printf("Ascend310P not support f16->4bit quantization.\n");
17
- }
18
- #else
19
-
20
- #define BUFFER_NUM 2
21
- #define Group_Size 32
22
-
23
- template <typename SRC_T>
24
- class QUANTIZE_FLOAT_TO_Q4_0 {
25
- public:
26
- __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
27
- __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
28
- int64_t *input_ne_ub, size_t *input_nb_ub,
29
- int64_t *output_ne_ub) {
30
- // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
31
- // permute=[0,0,0,0]):
32
- // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
33
- int64_t op_block_num = GetBlockNum();
34
- int64_t op_block_idx = GetBlockIdx();
35
-
36
- // input stride of data elements
37
- for (int i = 0; i < 4; i++) {
38
- input_ne[i] = input_ne_ub[i];
39
- input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
40
- output_ne[i] = output_ne_ub[i];
41
- }
42
-
43
- // output stride of data elements
44
- output_stride[0] = 1;
45
- for (int i = 1; i < 4; i++) {
46
- output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
47
- }
48
-
49
- // scales are saved one after another behind the data: [group1_scale, group2_scale, ...]
50
- scale_ne = input_ne;
51
- scale_stride[0] = 1;
52
- scale_stride[1] = input_ne[0] / Group_Size;
53
- for (int i = 2; i < 4; i++) {
54
- scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
55
- }
56
-
57
- // split input tensor by rows.
58
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
59
- dr = nr / op_block_num;
60
-
61
- uint64_t tails = nr % op_block_num;
62
- if (op_block_idx < tails) {
63
- dr += 1;
64
- ir = dr * op_block_idx;
65
- } else {
66
- ir = dr * op_block_idx + tails;
67
- }
68
-
69
- group_size_in_row = scale_stride[1];
70
- int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
71
- output_ne[3] * sizeof(uint8_t) / 2;
72
-
73
- input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
74
- output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
75
- scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
76
- group_size_in_row *
77
- sizeof(half)));
78
-
79
- pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
80
- pipe.InitBuffer(output_queue, BUFFER_NUM,
81
- Group_Size * sizeof(int8_t) / 2);
82
- pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
83
- pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
84
- pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
85
- pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
86
- pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
87
- pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
88
- pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
89
- }
90
-
91
- __aicore__ inline void copy_in(uint32_t offset) {
92
- LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
93
- DataCopy(input_local, input_gm[offset], Group_Size);
94
- input_queue.EnQue(input_local);
95
- }
96
-
97
- __aicore__ inline void copy_out(uint32_t offset) {
98
- // reinterpret-cast Group_Size (32) int4b_t values to Group_Size / 2 int8_t values,
99
- // and use DataCopyPad to avoid the 32-bit alignment restriction.
100
- LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
101
- LocalTensor<int8_t> output_int8_local =
102
- output_local.ReinterpretCast<int8_t>();
103
-
104
- DataCopyExtParams dataCopyParams;
105
- dataCopyParams.blockCount = 1;
106
- dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
107
- DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
108
-
109
- output_queue.FreeTensor(output_local);
110
- }
111
-
112
- __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
113
- LocalTensor<float> input_local) {
114
- DataCopy(cast_local, input_local, Group_Size);
115
- }
116
-
117
- __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
118
- LocalTensor<half> input_local) {
119
- Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
120
- }
121
-
122
- __aicore__ inline half calculate_group(int64_t row, int64_t group) {
123
- const int64_t i3 = row / (input_ne[1] * input_ne[2]);
124
- const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
125
- const int64_t i1 =
126
- row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
127
-
128
- const int64_t input_offset = i1 * input_stride[1] +
129
- i2 * input_stride[2] +
130
- i3 * input_stride[3] + Group_Size * group;
131
-
132
- // output_offset indexes output_gm, whose element type is int8_t; dividing
133
- // by 2 is needed because the data is stored as int4b_t.
134
- const int64_t output_offset = (i1 * output_stride[1] +
135
- i2 * output_stride[2] +
136
- i3 * output_stride[3] +
137
- Group_Size * group) / 2;
138
- copy_in(input_offset);
139
-
140
- LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
141
- LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
142
- LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
143
- LocalTensor<float> work_local = work_queue.AllocTensor<float>();
144
- LocalTensor<float> max_local = max_queue.AllocTensor<float>();
145
- LocalTensor<float> min_local = min_queue.AllocTensor<float>();
146
- LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
147
- LocalTensor<half> half_local = half_queue.AllocTensor<half>();
148
-
149
- input_to_cast(cast_local, input_local);
150
-
151
- ReduceMax(max_local, cast_local, work_local, Group_Size);
152
- ReduceMin(min_local, cast_local, work_local, Group_Size);
153
- const float max_value = max_local.GetValue(0);
154
- const float min_value = min_local.GetValue(0);
155
- float d = max_value;
156
- if (min_value < 0 && (-1 * min_value) > max_value) {
157
- d = min_value;
158
- }
159
-
160
- d = d / (-8);
161
- if (d != 0) {
162
- Muls(cast_local, cast_local, 1.0f / d, Group_Size);
163
- }
164
-
165
- // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
166
- float scalar = 8.5f;
167
- Adds(cast_local, cast_local, scalar, Group_Size);
168
- Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
169
- scalar = 15.0f;
170
- Mins(cast_local, cast_local, scalar, Group_Size);
171
- scalar = -8.0f;
172
- Adds(cast_local, cast_local, scalar, Group_Size);
173
-
174
- // float->half->int4b
175
- Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
176
- Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
177
-
178
- output_queue.EnQue(output_local);
179
- copy_out(output_offset);
180
-
181
- input_queue.FreeTensor(input_local);
182
- work_queue.FreeTensor(work_local);
183
- max_queue.FreeTensor(max_local);
184
- min_queue.FreeTensor(min_local);
185
- int8_queue.FreeTensor(int8_local);
186
- half_queue.FreeTensor(half_local);
187
- cast_queue.FreeTensor(cast_local);
188
- return (half)d;
189
- }
190
-
191
- __aicore__ inline void calculate() {
192
- LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
193
- uint32_t scale_local_offset = 0;
194
- uint32_t scale_global_offset = 0;
195
- for (int64_t i = ir; i < ir + dr; i++) {
196
- for (int64_t j = 0; j < group_size_in_row; j++) {
197
- half scale = calculate_group(i, j);
198
- scale_local.SetValue(scale_local_offset++, scale);
199
- // Copy Group_Size/2 scales at a time.
200
- if (scale_local_offset == Group_Size / 2) {
201
- scale_local_offset = 0;
202
- // TODO: OPTIMIZE ME
203
- pipe_barrier(PIPE_ALL);
204
- DataCopy(scale_gm[scale_global_offset], scale_local,
205
- Group_Size / 2);
206
- pipe_barrier(PIPE_ALL);
207
- scale_global_offset += Group_Size / 2;
208
- }
209
- }
210
- }
211
-
212
- if (scale_local_offset != 0) {
213
- pipe_barrier(PIPE_ALL);
214
- DataCopyExtParams dataCopyParams;
215
- dataCopyParams.blockCount = 1;
216
- dataCopyParams.blockLen = scale_local_offset * sizeof(half);
217
- DataCopyPad(scale_gm[scale_global_offset], scale_local,
218
- dataCopyParams);
219
- pipe_barrier(PIPE_ALL);
220
- }
221
- scale_queue.FreeTensor(scale_local);
222
- }
223
-
224
- private:
225
- int64_t input_ne[4];
226
- size_t input_stride[4];
227
-
228
- int64_t *scale_ne;
229
- size_t scale_stride[4];
230
-
231
- int64_t output_ne[4];
232
- size_t output_stride[4];
233
-
234
- int64_t group_size_in_row;
235
-
236
- int64_t ir;
237
- int64_t dr;
238
-
239
- TPipe pipe;
240
- GlobalTensor<SRC_T> input_gm;
241
- GlobalTensor<half> scale_gm;
242
- GlobalTensor<int8_t> output_gm;
243
- TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
244
- TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
245
- TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
246
- TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
247
- TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
248
- TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
249
- TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
250
- TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
251
- TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
252
- };
253
-
254
- template <typename T>
255
- __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
256
- auto gm_ptr = (__gm__ uint8_t *)gm;
257
- auto ub_ptr = (uint8_t *)(ub);
258
- for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
259
- *ub_ptr = *gm_ptr;
260
- }
261
- }
262
-
263
- extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
264
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
265
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
266
- int64_t input_ne_ub[4];
267
- size_t input_nb_ub[4];
268
- int64_t output_ne_ub[4];
269
-
270
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
271
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
272
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
273
-
274
- QUANTIZE_FLOAT_TO_Q4_0<half> op;
275
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
276
- op.calculate();
277
- }
278
-
279
- extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
280
- GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
281
- GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
282
- int64_t input_ne_ub[4];
283
- size_t input_nb_ub[4];
284
- int64_t output_ne_ub[4];
285
-
286
- copy_to_ub(input_ne_gm, input_ne_ub, 32);
287
- copy_to_ub(input_nb_gm, input_nb_ub, 32);
288
- copy_to_ub(output_ne_gm, output_ne_ub, 32);
289
-
290
- QUANTIZE_FLOAT_TO_Q4_0<float> op;
291
- op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
292
- op.calculate();
293
- }
294
-
295
- #endif // #ifdef ASCEND_310P
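
For context, the kernel above picks the signed extreme with the largest magnitude in each group of 32 values, sets d = extreme / -8, and maps x/d from [-8, 8] onto the 4-bit range [-8, 7] exactly as the range comment describes. A rough CPU equivalent (illustrative names; it also skips the nibble packing that the kernel does through int4b_t and ReinterpretCast):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // CPU sketch of the per-group Q4_0 mapping used above; writes one int8 in
    // [-8, 7] per input value instead of packing two values per byte.
    static float quantize_group_q4_0_ref(const float *x, int8_t *q) {
        const int gs = 32;  // Group_Size
        float vmax = x[0], vmin = x[0];
        for (int i = 1; i < gs; ++i) {
            vmax = std::max(vmax, x[i]);
            vmin = std::min(vmin, x[i]);
        }
        float d = (vmin < 0.0f && -vmin > vmax) ? vmin : vmax;  // signed extreme
        d /= -8.0f;
        const float id = d != 0.0f ? 1.0f / d : 0.0f;
        for (int i = 0; i < gs; ++i) {
            // [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
            float v = std::floor(x[i] * id + 8.5f);
            q[i] = (int8_t) (std::min(v, 15.0f) - 8.0f);
        }
        return d;  // the kernel stores this scale as half after the packed data
    }
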
 
ggml/src/ggml-kompute/CMakeLists.txt DELETED
@@ -1,166 +0,0 @@
1
-
2
- find_package(Vulkan COMPONENTS glslc REQUIRED)
3
- find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
4
-
5
- if (NOT glslc_executable)
6
- message(FATAL_ERROR "glslc not found")
7
- endif()
8
-
9
- ggml_add_backend_library(ggml-kompute
10
- ggml-kompute.cpp
11
- ../../include/ggml-kompute.h
12
- )
13
-
14
- target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
15
- target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
16
-
17
- add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
18
-
19
- function(compile_shader)
20
- set(options)
21
- set(oneValueArgs)
22
- set(multiValueArgs SOURCES)
23
- cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
24
- foreach(source ${compile_shader_SOURCES})
25
- get_filename_component(filename ${source} NAME)
26
- set(spv_file ${filename}.spv)
27
- add_custom_command(
28
- OUTPUT ${spv_file}
29
- DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
30
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
31
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
32
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
33
- ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
34
- COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
35
- COMMENT "Compiling ${source} to ${spv_file}"
36
- )
37
-
38
- get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
39
- set(FILE_NAME "shader${RAW_FILE_NAME}")
40
- string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
41
- string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
42
- string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
43
- set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
44
- message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
45
- if(CMAKE_GENERATOR MATCHES "Visual Studio")
46
- add_custom_command(
47
- OUTPUT ${OUTPUT_HEADER_FILE}
48
- COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
49
- COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
50
- COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
51
- COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
52
- COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
53
- COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
54
- COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
55
- COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
56
- DEPENDS ${spv_file} xxd
57
- COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
58
- )
59
- else()
60
- add_custom_command(
61
- OUTPUT ${OUTPUT_HEADER_FILE}
62
- COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
63
- COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
64
- COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
65
- COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
66
- COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
67
- COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
68
- COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
69
- COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
70
- DEPENDS ${spv_file} xxd
71
- COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
72
- )
73
- endif()
74
- endforeach()
75
- endfunction()
76
-
77
- if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
78
- message(STATUS "Kompute found")
79
- set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
80
- add_subdirectory(kompute)
81
-
82
- # Compile our shaders
83
- compile_shader(SOURCES
84
- kompute-shaders/op_scale.comp
85
- kompute-shaders/op_scale_8.comp
86
- kompute-shaders/op_add.comp
87
- kompute-shaders/op_addrow.comp
88
- kompute-shaders/op_mul.comp
89
- kompute-shaders/op_silu.comp
90
- kompute-shaders/op_relu.comp
91
- kompute-shaders/op_gelu.comp
92
- kompute-shaders/op_softmax.comp
93
- kompute-shaders/op_norm.comp
94
- kompute-shaders/op_rmsnorm.comp
95
- kompute-shaders/op_diagmask.comp
96
- kompute-shaders/op_mul_mat_mat_f32.comp
97
- kompute-shaders/op_mul_mat_f16.comp
98
- kompute-shaders/op_mul_mat_q8_0.comp
99
- kompute-shaders/op_mul_mat_q4_0.comp
100
- kompute-shaders/op_mul_mat_q4_1.comp
101
- kompute-shaders/op_mul_mat_q4_k.comp
102
- kompute-shaders/op_mul_mat_q6_k.comp
103
- kompute-shaders/op_getrows_f32.comp
104
- kompute-shaders/op_getrows_f16.comp
105
- kompute-shaders/op_getrows_q4_0.comp
106
- kompute-shaders/op_getrows_q4_1.comp
107
- kompute-shaders/op_getrows_q6_k.comp
108
- kompute-shaders/op_rope_norm_f16.comp
109
- kompute-shaders/op_rope_norm_f32.comp
110
- kompute-shaders/op_rope_neox_f16.comp
111
- kompute-shaders/op_rope_neox_f32.comp
112
- kompute-shaders/op_cpy_f16_f16.comp
113
- kompute-shaders/op_cpy_f16_f32.comp
114
- kompute-shaders/op_cpy_f32_f16.comp
115
- kompute-shaders/op_cpy_f32_f32.comp
116
- )
117
-
118
- # Create a custom target for our generated shaders
119
- add_custom_target(generated_shaders DEPENDS
120
- shaderop_scale.h
121
- shaderop_scale_8.h
122
- shaderop_add.h
123
- shaderop_addrow.h
124
- shaderop_mul.h
125
- shaderop_silu.h
126
- shaderop_relu.h
127
- shaderop_gelu.h
128
- shaderop_softmax.h
129
- shaderop_norm.h
130
- shaderop_rmsnorm.h
131
- shaderop_diagmask.h
132
- shaderop_mul_mat_mat_f32.h
133
- shaderop_mul_mat_f16.h
134
- shaderop_mul_mat_q8_0.h
135
- shaderop_mul_mat_q4_0.h
136
- shaderop_mul_mat_q4_1.h
137
- shaderop_mul_mat_q4_k.h
138
- shaderop_mul_mat_q6_k.h
139
- shaderop_getrows_f32.h
140
- shaderop_getrows_f16.h
141
- shaderop_getrows_q4_0.h
142
- shaderop_getrows_q4_1.h
143
- shaderop_getrows_q6_k.h
144
- shaderop_rope_norm_f16.h
145
- shaderop_rope_norm_f32.h
146
- shaderop_rope_neox_f16.h
147
- shaderop_rope_neox_f32.h
148
- shaderop_cpy_f16_f16.h
149
- shaderop_cpy_f16_f32.h
150
- shaderop_cpy_f32_f16.h
151
- shaderop_cpy_f32_f32.h
152
- )
153
-
154
- # Create a custom command that depends on the generated_shaders
155
- add_custom_command(
156
- OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
157
- COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
158
- DEPENDS generated_shaders
159
- COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
160
- )
161
-
162
- # Add the stamp to the main sources to ensure dependency tracking
163
- target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
164
- else()
165
- message(WARNING "Kompute not found")
166
- endif()
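
To make the echo/xxd pipeline above more concrete: each compiled .comp shader is embedded as a byte array inside a generated header under the kp::shader_data namespace, which ggml-kompute.cpp then includes. Roughly, the generated shaderop_add.h would look like the sketch below (the byte values and length are placeholders; the real contents are whatever glslc and xxd produce at build time):

    /*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/
    #ifndef SHADEROP_ADD_H
    #define SHADEROP_ADD_H
    namespace kp {
    namespace shader_data {
    // xxd -i names the array after the input file: op_add.comp.spv -> op_add_comp_spv
    unsigned char op_add_comp_spv[] = { 0x03, 0x02, 0x23, 0x07 /* , ... SPIR-V bytes ... */ };
    unsigned int op_add_comp_spv_len = 4;  // xxd emits the real byte count here
    }}
    #endif // define SHADEROP_ADD_H
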
 
ggml/src/ggml-kompute/ggml-kompute.cpp DELETED
@@ -1,2251 +0,0 @@
1
- #include "ggml-impl.h"
2
- #include "ggml-backend.h"
3
- #include "ggml-backend-impl.h"
4
- #include "ggml-kompute.h"
5
-
6
- // These are generated at build time by a cmake custom command
7
- #include "shaderop_scale.h"
8
- #include "shaderop_scale_8.h"
9
- #include "shaderop_add.h"
10
- #include "shaderop_addrow.h"
11
- #include "shaderop_mul.h"
12
- #include "shaderop_silu.h"
13
- #include "shaderop_relu.h"
14
- #include "shaderop_gelu.h"
15
- #include "shaderop_softmax.h"
16
- #include "shaderop_norm.h"
17
- #include "shaderop_rmsnorm.h"
18
- #include "shaderop_diagmask.h"
19
- #include "shaderop_mul_mat_f16.h"
20
- #include "shaderop_mul_mat_q8_0.h"
21
- #include "shaderop_mul_mat_q4_0.h"
22
- #include "shaderop_mul_mat_q4_1.h"
23
- #include "shaderop_mul_mat_q4_k.h"
24
- #include "shaderop_mul_mat_q6_k.h"
25
- #include "shaderop_mul_mat_mat_f32.h"
26
- #include "shaderop_getrows_f32.h"
27
- #include "shaderop_getrows_f16.h"
28
- #include "shaderop_getrows_q4_0.h"
29
- #include "shaderop_getrows_q4_1.h"
30
- #include "shaderop_getrows_q6_k.h"
31
- #include "shaderop_rope_norm_f16.h"
32
- #include "shaderop_rope_norm_f32.h"
33
- #include "shaderop_rope_neox_f16.h"
34
- #include "shaderop_rope_neox_f32.h"
35
- #include "shaderop_cpy_f16_f16.h"
36
- #include "shaderop_cpy_f16_f32.h"
37
- #include "shaderop_cpy_f32_f16.h"
38
- #include "shaderop_cpy_f32_f32.h"
39
-
40
- #include <algorithm>
41
- #include <array>
42
- #include <cassert>
43
- #include <cstdint>
44
- #include <cstdio>
45
- #include <cstring>
46
- #include <iostream>
47
- #include <memory>
48
- #include <mutex>
49
- #include <stdexcept>
50
- #include <string>
51
- #include <unordered_map>
52
- #include <utility>
53
- #include <vector>
54
-
55
- #include <kompute/Kompute.hpp>
56
- #include <vulkan/vulkan.hpp>
57
-
58
- #ifdef __linux__
59
- #include <cstdlib> // for setenv
60
- #endif
61
-
62
- #define QK4_0 32
63
- #define QR4_0 2
64
- #define QK4_1 32
65
- #define QK_NL 16
66
-
67
- typedef ggml_fp16_t half;
68
-
69
- static std::string ggml_kompute_format_name(int device) {
70
- return "Kompute" + std::to_string(device);
71
- }
72
-
73
- struct ggml_kompute_context {
74
- int device;
75
- std::string name;
76
- std::shared_ptr<vk::DescriptorPool> pool;
77
-
78
- ggml_kompute_context(int device)
79
- : device(device), name(ggml_kompute_format_name(device)) {}
80
- };
81
-
82
- // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
83
- // and consolidate the init functions and simplify object lifetime management. As it currently stands,
84
- // we *have* to have the kompute manager no matter what for device discovery, but the kompute context
85
- // is only created when a device is set and vulkan is explicitly turned on.
86
- static ggml_kompute_context *s_kompute_context = nullptr;
87
-
88
- class kompute_manager {
89
- kp::Manager *s_mgr = nullptr;
90
-
91
- public:
92
- kp::Manager *operator()() {
93
- if (s_mgr && !s_mgr->hasInstance()) {
94
- destroy();
95
- }
96
- if (!s_mgr) {
97
- s_mgr = new kp::Manager;
98
- }
99
- return s_mgr;
100
- }
101
-
102
- void destroy() {
103
- delete s_mgr;
104
- s_mgr = nullptr;
105
- }
106
- };
107
-
108
- static kompute_manager komputeManager;
109
-
110
- struct ggml_vk_memory {
111
- void *data = nullptr;
112
- size_t size = 0;
113
- vk::DeviceMemory *primaryMemory = nullptr;
114
- vk::Buffer *primaryBuffer = nullptr;
115
- vk::DeviceMemory *stagingMemory = nullptr;
116
- vk::Buffer *stagingBuffer = nullptr;
117
- };
118
-
119
- #ifdef __linux__
120
- __attribute__((constructor))
121
- static void enable_sam() {
122
- setenv("RADV_PERFTEST", "sam", false);
123
- }
124
- #endif
125
-
126
- static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) {
127
- vk::PhysicalDeviceFeatures availableFeatures;
128
- physical_device.getFeatures(&availableFeatures);
129
-
130
- if (!availableFeatures.shaderInt16)
131
- return false;
132
-
133
- vk::PhysicalDeviceVulkan11Features availableFeatures11;
134
- vk::PhysicalDeviceVulkan12Features availableFeatures12;
135
-
136
- availableFeatures11.pNext = &availableFeatures12;
137
- availableFeatures12.pNext = nullptr;
138
-
139
- vk::PhysicalDeviceFeatures2 features2;
140
- features2.pNext = &availableFeatures11;
141
-
142
- physical_device.getFeatures2(&features2);
143
-
144
- if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
145
- !availableFeatures11.storageBuffer16BitAccess) {
146
- return false;
147
- }
148
-
149
- if (!availableFeatures12.storageBuffer8BitAccess ||
150
- !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
151
- !availableFeatures12.shaderFloat16 ||
152
- !availableFeatures12.shaderInt8) {
153
- return false;
154
- }
155
-
156
- return true;
157
- }
158
-
159
- static const char * ggml_vk_getVendorName(uint32_t vendorID) {
160
- switch (vendorID) {
161
- case 0x10DE:
162
- return "nvidia";
163
- case 0x1002:
164
- return "amd";
165
- case 0x8086:
166
- return "intel";
167
- default:
168
- return "unknown";
169
- }
170
- }
171
-
172
- static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t memoryRequired) {
173
- std::vector<ggml_vk_device> results;
174
- if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
175
- return results;
176
-
177
- std::vector<vk::PhysicalDevice> physical_devices;
178
- try {
179
- physical_devices = komputeManager()->listDevices();
180
- } catch (vk::SystemError & err) {
181
- std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
182
- return results;
183
- }
184
-
185
- uint32_t deviceCount = physical_devices.size();
186
- if (deviceCount == 0)
187
- return results;
188
-
189
- std::unordered_map<std::string, size_t> count_by_name;
190
-
191
- for (uint32_t i = 0; i < deviceCount; i++) {
192
- const auto & physical_device = physical_devices[i];
193
-
194
- VkPhysicalDeviceProperties dev_props = physical_device.getProperties();
195
- VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties();
196
- const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion);
197
- const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion);
198
- if (major < 1 || minor < 2)
199
- continue;
200
-
201
- if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device))
202
- continue;
203
-
204
- size_t heapSize = 0;
205
- for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
206
- VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
207
- if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
208
- heapSize = heap.size;
209
- break;
210
- }
211
- }
212
-
213
- if (heapSize < memoryRequired)
214
- continue;
215
-
216
- auto ext_props = physical_device.enumerateDeviceExtensionProperties();
217
- bool has_maintenance4 = false;
218
-
219
- // Check if maintenance4 is supported
220
- for (const auto & properties : ext_props) {
221
- if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
222
- has_maintenance4 = true;
223
- }
224
- }
225
-
226
- vk::PhysicalDeviceSubgroupProperties subgroup_props;
227
- vk::PhysicalDeviceProperties2 dev_props2;
228
- vk::PhysicalDeviceMaintenance3Properties dev_props3;
229
- vk::PhysicalDeviceMaintenance4Properties dev_props4;
230
- dev_props2.pNext = &dev_props3;
231
- dev_props3.pNext = &subgroup_props;
232
- if (has_maintenance4) {
233
- subgroup_props.pNext = &dev_props4;
234
- }
235
- physical_device.getProperties2(&dev_props2);
236
-
237
- if (subgroup_props.subgroupSize < 32)
238
- continue;
239
-
240
- ggml_vk_device d;
241
- d.index = i;
242
- d.type = dev_props.deviceType;
243
- d.heapSize = heapSize;
244
- d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID));
245
- d.subgroupSize = subgroup_props.subgroupSize;
246
- d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment;
247
-
248
- if (has_maintenance4) {
249
- d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize);
250
- } else {
251
- d.maxAlloc = dev_props3.maxMemoryAllocationSize;
252
- }
253
-
254
- std::string name(dev_props.deviceName);
255
- size_t n_idx = ++count_by_name[name];
256
- if (n_idx > 1) {
257
- name += " (" + std::to_string(n_idx) + ")";
258
- }
259
- d.name = strdup(name.c_str());
260
-
261
- results.push_back(d);
262
- }
263
-
264
- std::stable_sort(results.begin(), results.end(),
265
- [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
266
- if (lhs.type != rhs.type) {
267
- if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
268
- if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;
269
-
270
- if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
271
- if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
272
- }
273
- return lhs.heapSize < rhs.heapSize;
274
- }
275
- );
276
-
277
- return results;
278
- }
279
-
280
- static std::vector<ggml_vk_device>& ggml_vk_available_devices() {
281
- static std::vector<ggml_vk_device> devices = ggml_vk_available_devices_internal(0);
282
- return devices;
283
- }
284
-
285
- static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
286
- devices.erase(
287
- std::remove_if(devices.begin(), devices.end(),
288
- [&targetVendor](const ggml_vk_device& device) {
289
- return device.vendor != targetVendor;
290
- }),
291
- devices.end()
292
- );
293
- }
294
-
295
- static void ggml_vk_filterByName(std::vector<ggml_vk_device>& devices, const std::string& targetName) {
296
- devices.erase(
297
- std::remove_if(devices.begin(), devices.end(),
298
- [&targetName](const ggml_vk_device& device) {
299
- return device.name != targetName;
300
- }),
301
- devices.end()
302
- );
303
- }
304
-
305
- static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) {
306
- if (name.empty())
307
- return false;
308
-
309
- auto devices = ggml_vk_available_devices_internal(memoryRequired);
310
- if (name == "amd" || name == "nvidia" || name == "intel") {
311
- ggml_vk_filterByVendor(devices, name);
312
- } else if (name != "gpu") {
313
- ggml_vk_filterByName(devices, name);
314
- }
315
-
316
- if (devices.empty())
317
- return false;
318
-
319
- *device = devices.front();
320
- return true;
321
- }
322
-
323
- bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) {
324
- return ggml_vk_get_device(device, memoryRequired, std::string(name));
325
- }
326
-
327
- bool ggml_vk_has_vulkan() {
328
- return komputeManager()->hasVulkan();
329
- }
330
-
331
- bool ggml_vk_has_device() {
332
- return komputeManager()->hasDevice();
333
- }
334
-
335
- ggml_vk_device ggml_vk_current_device() {
336
- if (!komputeManager()->hasDevice())
337
- return ggml_vk_device();
338
-
339
- auto devices = ggml_vk_available_devices();
340
- ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
341
- GGML_ASSERT(!devices.empty());
342
- return devices.front();
343
- }
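
For reference, the (now removed) public entry points above could be used by an application roughly as follows; this is only a sketch against the deleted ggml-kompute.h API, with the memory requirement and the "gpu" selector chosen arbitrarily:

    #include <cstdio>
    #include "ggml-kompute.h"  // header removed by this commit; shown for illustration

    int main() {
        if (!ggml_vk_has_vulkan()) {
            std::fprintf(stderr, "no Vulkan support available\n");
            return 1;
        }
        ggml_vk_device dev;
        // "amd"/"nvidia"/"intel" filter by vendor, "gpu" accepts any device,
        // anything else is matched against the device name.
        if (!ggml_vk_get_device(&dev, /*memoryRequired =*/ 0, "gpu")) {
            std::fprintf(stderr, "no suitable Vulkan device found\n");
            return 1;
        }
        std::printf("selected %s (%s), subgroup size %u\n", dev.name, dev.vendor, dev.subgroupSize);
        return 0;
    }
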
344
-
345
- static
346
- void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
347
- std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
348
- vk::DescriptorPoolSize(
349
- vk::DescriptorType::eStorageBuffer,
350
- 4 * size // Descriptor count is number of possible tensors to pass into an algorithm
351
- )
352
- };
353
-
354
- vk::DescriptorPoolCreateInfo descriptorPoolInfo(
355
- vk::DescriptorPoolCreateFlags(),
356
- size, // Max sets
357
- static_cast<uint32_t>(descriptorPoolSizes.size()),
358
- descriptorPoolSizes.data());
359
-
360
- ctx->pool = std::make_shared<vk::DescriptorPool>();
361
- vk::Result r = komputeManager()->device()->createDescriptorPool(
362
- &descriptorPoolInfo, nullptr, ctx->pool.get());
363
- if (r != vk::Result::eSuccess)
364
- std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
365
- }
366
-
367
- static
368
- void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
369
- if (ctx->pool) {
370
- komputeManager()->device()->destroy(
371
- *ctx->pool,
372
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
373
- ctx->pool = nullptr;
374
- }
375
- }
376
-
377
- static
378
- vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
379
- vk::BufferCreateInfo bufferCreateInfo;
380
- bufferCreateInfo.size = size;
381
- bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
382
- vk::BufferUsageFlagBits::eTransferSrc |
383
- vk::BufferUsageFlagBits::eTransferDst;
384
- bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
385
-
386
- vk::Buffer *vkBuffer = new vk::Buffer;
387
- vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
388
- if (r != vk::Result::eSuccess)
389
- std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl;
390
- return vkBuffer;
391
- }
392
-
393
- static
394
- vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
395
-
396
- uint32_t memoryTypeIndex = -1;
397
- bool memoryTypeIndexFound = false;
398
- vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
399
- for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
400
- const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i];
401
- const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex];
402
- if (memoryHeap.size < size) {
403
- continue;
404
- }
405
-
406
- if (requirements.memoryTypeBits & (1 << i)) {
407
- if (((memoryProperties.memoryTypes[i]).propertyFlags &
408
- flags) == flags) {
409
- memoryTypeIndex = i;
410
- memoryTypeIndexFound = true;
411
- if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) {
412
- *isHostVisible = true;
413
- }
414
- break;
415
- }
416
- }
417
- }
418
- if (!memoryTypeIndexFound) {
419
- throw std::runtime_error(
420
- "Memory type index for buffer creation not found");
421
- }
422
-
423
- vk::MemoryAllocateInfo allocInfo;
424
- allocInfo.allocationSize = size;
425
- allocInfo.memoryTypeIndex = memoryTypeIndex;
426
- vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory;
427
- vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
428
- if (r != vk::Result::eSuccess) {
429
- std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
430
- throw std::runtime_error("Error allocating vulkan memory.");
431
- }
432
- return vkDeviceMemory;
433
- }
434
-
435
- static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) {
436
- size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer);
437
-
438
- // If offset is already aligned, return it directly
439
- if (offset % minStorageBufferOffsetAlignment == 0) {
440
- return offset;
441
- }
442
-
443
- // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset
444
- return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
445
- }
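
The rounding above is the usual round-down-to-alignment trick; a self-contained restatement with the alignment passed in explicitly (the helper name and the 256-byte alignment are just illustrative values):

    #include <cassert>
    #include <cstddef>

    // Same arithmetic as ggml_vk_aligned_offset(), with the alignment as a parameter.
    static size_t aligned_offset(size_t offset, size_t alignment) {
        if (offset % alignment == 0) {
            return offset;  // already aligned
        }
        return (offset / alignment) * alignment;  // round down to a multiple
    }

    int main() {
        assert(aligned_offset(512, 256) == 512);  // unchanged when already aligned
        assert(aligned_offset(300, 256) == 256);  // the remaining 44 bytes become an extra in-buffer offset
        assert(aligned_offset(100, 256) == 0);
        return 0;
    }
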
446
-
447
- static ggml_vk_memory ggml_vk_allocate(size_t size) {
448
- ggml_vk_memory memory;
449
- bool isHostVisible = false;
450
- {
451
- memory.primaryBuffer = ggml_vk_allocate_buffer(size);
452
- vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
453
- vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
454
- memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
455
- komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
456
- if (isHostVisible) {
457
- vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
458
- if (r != vk::Result::eSuccess)
459
- std::cerr << "Error mapping memory" << vk::to_string(r);
460
- }
461
- }
462
-
463
- if (!isHostVisible) {
464
- memory.stagingBuffer = ggml_vk_allocate_buffer(size);
465
- vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer);
466
- vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
467
- vk::MemoryPropertyFlagBits::eHostCoherent |
468
- vk::MemoryPropertyFlagBits::eHostCached;
469
- memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
470
- komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
471
- vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
472
- if (r != vk::Result::eSuccess)
473
- std::cerr << "Error mapping memory" << vk::to_string(r);
474
- }
475
-
476
- memory.size = size;
477
- return memory;
478
- }
479
-
480
- static void ggml_vk_free_memory(ggml_vk_memory &memory)
481
- {
482
- komputeManager()->device()->destroy(
483
- *memory.primaryBuffer,
484
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
485
- if (memory.stagingBuffer) {
486
- komputeManager()->device()->destroy(
487
- *memory.stagingBuffer,
488
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
489
- }
490
- komputeManager()->device()->freeMemory(
491
- *memory.primaryMemory,
492
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
493
- if (memory.stagingMemory) {
494
- komputeManager()->device()->freeMemory(
495
- *memory.stagingMemory,
496
- (vk::Optional<const vk::AllocationCallbacks>)nullptr);
497
- }
498
- }
499
-
500
- static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft);
501
-
502
- static
503
- ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) {
504
- ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
505
-
506
- // compatibility with ggml-backend
507
- GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name);
508
-
509
- ggml_vk_memory * buf_ctx = static_cast<ggml_vk_memory *>(buffer->context);
510
-
511
- const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data);
512
-
513
- GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size));
514
-
515
- offset = uint64_t(ioffs);
516
- return buf_ctx;
517
- }
518
-
519
- static
520
- const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
521
- uint64_t originalOffset = 0;
522
- auto * res = ggml_vk_find_tensor(t, originalOffset);
523
- if (!res) {
524
- static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
525
- return nullTensor;
526
- }
527
-
528
- // Create a tensor whose memory will be composed of our buffers at the correct offset
529
- const size_t nelements = ggml_nelements(t);
530
- size_t nbytes = ggml_nbytes(t);
531
-
532
- size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset);
533
- if (alignedOffset) {
534
- *alignedOffset = originalOffset - vulkanOffset;
535
- nbytes += *alignedOffset;
536
- }
537
-
538
- return komputeManager()->tensor(
539
- t->data,
540
- nelements,
541
- nbytes, kp::Tensor::TensorDataTypes::eFloat,
542
- res->primaryMemory, res->primaryBuffer,
543
- res->stagingMemory, res->stagingBuffer,
544
- vulkanOffset);
545
- }
546
-
547
- static std::vector<uint32_t> getSpirvShader(const unsigned char* rawData, size_t size) {
548
- if (size % sizeof(uint32_t) != 0) {
549
- throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)");
550
- }
551
-
552
- const uint32_t* data_ptr = reinterpret_cast<const uint32_t*>(rawData);
553
- size_t count = size / sizeof(uint32_t);
554
- return std::vector<uint32_t>(data_ptr, data_ptr + count);
555
- }
556
-
557
- inline static
558
- uint32_t safe_divide(uint32_t a, uint32_t b) {
559
- if (b <= 1) {
560
- return a;
561
- }
562
- if ((a % b) != 0) {
563
- fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
564
- GGML_ABORT("safe_divide result would've had remainder");
565
- }
566
- return a / b;
567
- }
568
-
569
- static void ggml_vk_add(
570
- kp::Sequence& seq,
571
- const std::shared_ptr<kp::Tensor>& inA,
572
- const std::shared_ptr<kp::Tensor>& inB,
573
- const std::shared_ptr<kp::Tensor>& out,
574
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
575
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
576
- int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
577
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
578
- int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
579
- int32_t ne0,
580
- int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
581
- ) {
582
- const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv,
583
- kp::shader_data::op_add_comp_spv_len);
584
-
585
- struct PushConstants {
586
- uint32_t inAOff, inBOff, outOff;
587
- int32_t ne00;
588
- int32_t nb00, nb01, nb02, nb03;
589
- int32_t ne10, ne11, ne12, ne13;
590
- int32_t nb10, nb11, nb12, nb13;
591
- int32_t ne0;
592
- int32_t nb0, nb1, nb2, nb3;
593
- } const pushConsts {
594
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
595
- ne00,
596
- nb00, nb01, nb02, nb03,
597
- ne10, ne11, ne12, ne13,
598
- nb10, nb11, nb12, nb13,
599
- ne0,
600
- nb0, nb1, nb2, nb3
601
- };
602
-
603
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
604
- if (!komputeManager()->hasAlgorithm(__func__)) {
605
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
606
- } else {
607
- s_algo = komputeManager()->getAlgorithm(__func__);
608
- s_algo->setTensors({inA, inB, out});
609
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
610
- s_algo->setPushConstants<PushConstants>({pushConsts});
611
- s_algo->updateDescriptors(s_kompute_context->pool.get());
612
- }
613
- seq.record<kp::OpAlgoDispatch>(s_algo);
614
- }
615
-
616
- static void ggml_vk_addrow(kp::Sequence& seq,
617
- const std::shared_ptr<kp::Tensor>& inA,
618
- const std::shared_ptr<kp::Tensor>& inB,
619
- const std::shared_ptr<kp::Tensor>& out,
620
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
621
- uint32_t size, uint32_t row = 0) {
622
-
623
- const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv,
624
- kp::shader_data::op_addrow_comp_spv_len);
625
-
626
- struct PushConstants {
627
- uint32_t inAOff, inBOff, outOff;
628
- uint32_t row;
629
- } const pushConsts {
630
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
631
- row
632
- };
633
-
634
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
635
- if (!komputeManager()->hasAlgorithm(__func__))
636
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
637
- else {
638
- s_algo = komputeManager()->getAlgorithm(__func__);
639
- s_algo->setTensors({inA, inB, out});
640
- s_algo->setWorkgroup({size});
641
- s_algo->setPushConstants<PushConstants>({pushConsts});
642
- s_algo->updateDescriptors(s_kompute_context->pool.get());
643
- }
644
- seq.record<kp::OpAlgoDispatch>(s_algo);
645
- }
646
-
647
- static void ggml_vk_mul(
648
- kp::Sequence& seq,
649
- const std::shared_ptr<kp::Tensor>& inA,
650
- const std::shared_ptr<kp::Tensor>& inB,
651
- const std::shared_ptr<kp::Tensor>& out,
652
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
653
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
654
- int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03,
655
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
656
- int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13,
657
- int32_t ne0,
658
- int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3
659
- ) {
660
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv,
661
- kp::shader_data::op_mul_comp_spv_len);
662
-
663
- struct PushConstants {
664
- uint32_t inAOff, inBOff, outOff;
665
- int32_t ne00;
666
- int32_t nb00, nb01, nb02, nb03;
667
- int32_t ne10, ne11, ne12, ne13;
668
- int32_t nb10, nb11, nb12, nb13;
669
- int32_t ne0;
670
- int32_t nb0, nb1, nb2, nb3;
671
- } const pushConsts {
672
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
673
- ne00,
674
- nb00, nb01, nb02, nb03,
675
- ne10, ne11, ne12, ne13,
676
- nb10, nb11, nb12, nb13,
677
- ne0,
678
- nb0, nb1, nb2, nb3
679
- };
680
-
681
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
682
- if (!komputeManager()->hasAlgorithm(__func__)) {
683
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
684
- } else {
685
- s_algo = komputeManager()->getAlgorithm(__func__);
686
- s_algo->setTensors({inA, inB, out});
687
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
688
- s_algo->setPushConstants<PushConstants>({pushConsts});
689
- s_algo->updateDescriptors(s_kompute_context->pool.get());
690
- }
691
- seq.record<kp::OpAlgoDispatch>(s_algo);
692
- }
693
-
694
- static void ggml_vk_scale(kp::Sequence& seq,
695
- const std::shared_ptr<kp::Tensor>& in,
696
- const std::shared_ptr<kp::Tensor>& out,
697
- uint32_t inOff, uint32_t outOff,
698
- uint32_t size, float scale) {
699
- const static auto spirv_1 = getSpirvShader(
700
- kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len
701
- );
702
- const static auto spirv_8 = getSpirvShader(
703
- kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len
704
- );
705
-
706
- struct PushConstants {
707
- uint32_t inOff, outOff;
708
- float scale;
709
- } const pushConsts {
710
- safe_divide(inOff, 4), safe_divide(outOff, 4),
711
- scale
712
- };
713
-
714
- const auto * spirv = &spirv_1;
715
- std::string name(__func__);
716
- if (size % 8 == 0) {
717
- size /= 8;
718
- name += "_8";
719
- spirv = &spirv_8;
720
- }
721
-
722
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
723
- if (!komputeManager()->hasAlgorithm(name)) {
724
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
725
- } else {
726
- s_algo = komputeManager()->getAlgorithm(name);
727
- s_algo->setTensors({in, out});
728
- s_algo->setWorkgroup({size});
729
- s_algo->setPushConstants<PushConstants>({pushConsts});
730
- s_algo->updateDescriptors(s_kompute_context->pool.get());
731
- }
732
- seq.record<kp::OpAlgoDispatch>(s_algo);
733
- }
734
-
735
- static void ggml_vk_xxlu(
736
- const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
737
- const std::shared_ptr<kp::Tensor>& in,
738
- const std::shared_ptr<kp::Tensor>& out,
739
- uint32_t inOff, uint32_t outOff,
740
- uint32_t size
741
- ) {
742
- struct PushConstants {
743
- uint32_t inOff, outOff;
744
- } const pushConsts {
745
- safe_divide(inOff, 4), safe_divide(outOff, 4),
746
- };
747
-
748
- auto name = std::string(__func__) + "_" + suffix;
749
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
750
- if (!komputeManager()->hasAlgorithm(name)) {
751
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
752
- } else {
753
- s_algo = komputeManager()->getAlgorithm(name);
754
- s_algo->setTensors({in, out});
755
- s_algo->setWorkgroup({size});
756
- s_algo->setPushConstants<PushConstants>({pushConsts});
757
- s_algo->updateDescriptors(s_kompute_context->pool.get());
758
- }
759
- seq.record<kp::OpAlgoDispatch>(s_algo);
760
- }
761
-
762
- template <typename... Args>
763
- static void ggml_vk_silu(Args&&... args) {
764
- const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
765
- kp::shader_data::op_silu_comp_spv_len);
766
-
767
- ggml_vk_xxlu(spirv, "silu", std::forward<Args>(args)...);
768
- }
769
-
770
- template <typename... Args>
771
- static void ggml_vk_relu(Args&&... args) {
772
- const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
773
- kp::shader_data::op_relu_comp_spv_len);
774
-
775
- ggml_vk_xxlu(spirv, "relu", std::forward<Args>(args)...);
776
- }
777
-
778
- template <typename... Args>
779
- static void ggml_vk_gelu(Args&&... args) {
780
- const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
781
- kp::shader_data::op_gelu_comp_spv_len);
782
-
783
- ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
784
- }
785
-
786
- static void ggml_vk_soft_max(
787
- kp::Sequence& seq,
788
- const std::shared_ptr<kp::Tensor>& inA,
789
- const std::shared_ptr<kp::Tensor>& inB,
790
- const std::shared_ptr<kp::Tensor>& out,
791
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
792
- int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
793
- float scale, float max_bias, float m0, float m1,
794
- uint32_t n_head_log2
795
- ) {
796
- const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
797
- kp::shader_data::op_softmax_comp_spv_len);
798
-
799
- struct PushConstants {
800
- uint32_t inAOff, inBOff, outOff;
801
- int32_t ne00, ne01, ne02;
802
- float scale, max_bias, m0, m1;
803
- uint32_t n_head_log2;
804
- int32_t mask;
805
- } pushConsts {
806
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
807
- ne00, ne01, ne02,
808
- scale, max_bias, m0, m1,
809
- n_head_log2,
810
- bool(inB)
811
- };
812
-
813
- auto & inB_ = inB ? inB : inA;
814
-
815
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
816
- if (!komputeManager()->hasAlgorithm(__func__)) {
817
- // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
818
- const uint32_t local_x = 32;
819
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
820
- } else {
821
- s_algo = komputeManager()->getAlgorithm(__func__);
822
- s_algo->setTensors({inA, inB_, out});
823
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
824
- s_algo->setPushConstants<PushConstants>({pushConsts});
825
- s_algo->updateDescriptors(s_kompute_context->pool.get());
826
- }
827
- seq.record<kp::OpAlgoDispatch>(s_algo);
828
- }
829
-
830
- static void ggml_vk_norm_(
831
- const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
832
- const std::shared_ptr<kp::Tensor>& in,
833
- const std::shared_ptr<kp::Tensor>& out,
834
- uint32_t inOff, uint32_t outOff,
835
- int32_t ne00, int32_t nb01,
836
- int32_t nrows, float epsilon
837
- ) {
838
- GGML_ASSERT(nb01%sizeof(float) == 0);
839
- GGML_ASSERT(ne00%sizeof(float) == 0);
840
-
841
- struct PushConstants {
842
- uint32_t inOff, outOff;
843
- uint32_t ne00, nb01;
844
- float eps;
845
- } pushConsts {
846
- safe_divide(inOff, 4), safe_divide(outOff, 4),
847
- (uint32_t)ne00, (uint32_t)nb01, epsilon
848
- };
849
-
850
- auto name = std::string(__func__) + "_" + suffix;
851
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
852
- if (!komputeManager()->hasAlgorithm(name)) {
853
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
854
- } else {
855
- s_algo = komputeManager()->getAlgorithm(name);
856
- s_algo->setTensors({in, out});
857
- s_algo->setWorkgroup({(uint32_t)nrows});
858
- s_algo->setPushConstants<PushConstants>({pushConsts});
859
- s_algo->updateDescriptors(s_kompute_context->pool.get());
860
- }
861
- seq.record<kp::OpAlgoDispatch>(s_algo);
862
- }
863
-
864
- template <typename... Args>
865
- static void ggml_vk_norm(Args&&... args) {
866
- const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
867
- kp::shader_data::op_norm_comp_spv_len);
868
-
869
- ggml_vk_norm_(spirv, "norm", std::forward<Args>(args)...);
870
- }
871
-
872
- template <typename... Args>
873
- static void ggml_vk_rms_norm(Args&&... args) {
874
- const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
875
- kp::shader_data::op_rmsnorm_comp_spv_len);
876
-
877
- ggml_vk_norm_(spirv, "rms", std::forward<Args>(args)...);
878
- }
879
-
880
- static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
881
- const std::shared_ptr<kp::Tensor>& in,
882
- const std::shared_ptr<kp::Tensor>& out,
883
- uint32_t inOff, uint32_t outOff,
884
- uint32_t n_past,
885
- int32_t ne00, int32_t ne01, int32_t ne02) {
886
- const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv,
887
- kp::shader_data::op_diagmask_comp_spv_len);
888
-
889
- struct PushConstants {
890
- uint32_t inOff, outOff;
891
- uint32_t n_past;
892
- int32_t ne00, ne01;
893
- } pushConsts {
894
- safe_divide(inOff, 4), safe_divide(outOff, 4),
895
- n_past,
896
- ne00, ne01
897
- };
898
-
899
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
900
- if (!komputeManager()->hasAlgorithm(__func__))
901
- s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
902
- else {
903
- s_algo = komputeManager()->getAlgorithm(__func__);
904
- s_algo->setTensors({in, out});
905
- s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
906
- s_algo->setPushConstants<PushConstants>({pushConsts});
907
- s_algo->updateDescriptors(s_kompute_context->pool.get());
908
- }
909
- seq.record<kp::OpAlgoDispatch>(s_algo);
910
- }
911
-
912
- static void ggml_vk_mul_mat_f16(
913
- kp::Sequence& seq,
914
- const std::shared_ptr<kp::Tensor>& inA,
915
- const std::shared_ptr<kp::Tensor>& inB,
916
- const std::shared_ptr<kp::Tensor>& out,
917
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
918
- int32_t ne00, int32_t ne01, int32_t ne02,
919
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
920
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
921
- uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
922
- int32_t ne0, int32_t ne1,
923
- uint32_t r2, uint32_t r3
924
- ) {
925
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv,
926
- kp::shader_data::op_mul_mat_f16_comp_spv_len);
927
-
928
- struct PushConstants {
929
- uint32_t inAOff, inBOff, outOff;
930
- int32_t ne00, ne01, ne02;
931
- uint32_t nb00, nb01, nb02, nb03;
932
- int32_t ne10, ne11, ne12;
933
- uint32_t nb10, nb11, nb12, nb13;
934
- int32_t ne0, ne1;
935
- uint32_t r2, r3;
936
- } pushConsts {
937
- safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
938
- ne00, ne01, ne02,
939
- nb00, nb01, nb02, nb03,
940
- ne10, ne11, ne12,
941
- nb10, nb11, nb12, nb13,
942
- ne0, ne1,
943
- r2, r3
944
- };
945
-
946
- const unsigned ny = unsigned((ne11 + 4 - 1)/4);
947
-
948
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
949
- if (!komputeManager()->hasAlgorithm(__func__)) {
950
- const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
951
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
952
- } else {
953
- s_algo = komputeManager()->getAlgorithm(__func__);
954
- s_algo->setTensors({inA, inB, out});
955
- s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)});
956
- s_algo->setPushConstants<PushConstants>({pushConsts});
957
- s_algo->updateDescriptors(s_kompute_context->pool.get());
958
- }
959
- seq.record<kp::OpAlgoDispatch>(s_algo);
960
- }
961
-
962
- static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
963
- const std::shared_ptr<kp::Tensor>& inA,
964
- const std::shared_ptr<kp::Tensor>& inB,
965
- const std::shared_ptr<kp::Tensor>& out,
966
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
967
- int32_t ne00, int32_t ne01, int32_t ne02,
968
- uint32_t nb01, uint32_t nb02,
969
- int32_t ne11, int32_t ne12,
970
- uint32_t nb11, uint32_t nb12,
971
- uint32_t nb1, uint32_t nb2) {
972
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv,
973
- kp::shader_data::op_mul_mat_mat_f32_comp_spv_len);
974
-
975
- struct PushConstants {
976
- uint32_t inAOff, inBOff, outOff;
977
- int32_t ne00, ne01, ne02, ne11, ne12;
978
- uint32_t nb01, nb02;
979
- uint32_t nb11, nb12;
980
- uint32_t nb1, nb2;
981
- } pushConsts {
982
- safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
983
- ne00, ne01, ne02, ne11, ne12,
984
- nb01, nb02, nb11, nb12,
985
- nb1, nb2
986
- };
987
-
988
- const uint32_t local_x = ggml_vk_current_device().subgroupSize;
989
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
990
- if (!komputeManager()->hasAlgorithm(__func__)) {
991
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
992
- {inA, inB, out}, spirv,
993
- {unsigned(ne01),
994
- unsigned(ne11),
995
- unsigned(std::max(ne12, ne02))
996
- },
997
- {local_x},
998
- {pushConsts});
999
- } else {
1000
- s_algo = komputeManager()->getAlgorithm(__func__);
1001
- s_algo->setTensors({inA, inB, out});
1002
- s_algo->setWorkgroup({unsigned(ne01),
1003
- unsigned(ne11),
1004
- unsigned(std::max(ne12, ne02)),
1005
- });
1006
- s_algo->setPushConstants<PushConstants>({pushConsts});
1007
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1008
- }
1009
- seq.record<kp::OpAlgoDispatch>(s_algo);
1010
- }
1011
-
1012
- static void ggml_vk_mul_mat_impl(
1013
- const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
1014
- const std::shared_ptr<kp::Tensor>& inA,
1015
- const std::shared_ptr<kp::Tensor>& inB,
1016
- const std::shared_ptr<kp::Tensor>& out,
1017
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1018
- int32_t ne00, int32_t ne01, int32_t ne02,
1019
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1020
- int32_t ne0, int32_t ne1,
1021
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
1022
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
1023
- uint32_t r2, uint32_t r3
1024
- ) {
1025
- struct PushConstants {
1026
- uint32_t inAOff, inBOff, outOff;
1027
- int32_t ne00, ne01, ne02;
1028
- int32_t ne10, ne12;
1029
- int32_t ne0, ne1;
1030
- uint32_t nb01, nb02, nb03;
1031
- uint32_t nb11, nb12, nb13;
1032
- uint32_t r2, r3;
1033
- } pushConsts {
1034
- safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1035
- ne00, ne01, ne02,
1036
- ne10, ne12,
1037
- ne0, ne1,
1038
- nb01, nb02, nb03,
1039
- nb11, nb12, nb13,
1040
- r2, r3
1041
- };
1042
-
1043
- auto name = std::string(__func__) + "_" + suffix;
1044
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1045
- if (!komputeManager()->hasAlgorithm(name)) {
1046
- const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
1047
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
1048
- } else {
1049
- s_algo = komputeManager()->getAlgorithm(name);
1050
- s_algo->setTensors({inA, inB, out});
1051
- s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)});
1052
- s_algo->setPushConstants<PushConstants>({pushConsts});
1053
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1054
- }
1055
- seq.record<kp::OpAlgoDispatch>(s_algo);
1056
- }
1057
-
1058
- template <typename... Args>
1059
- static void ggml_vk_mul_mat_q4_0(Args&&... args) {
1060
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
1061
- kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
1062
-
1063
- ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1064
- }
1065
-
1066
- template <typename... Args>
1067
- static void ggml_vk_mul_mat_q4_1(Args&&... args) {
1068
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
1069
- kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
1070
-
1071
- ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1072
- }
1073
-
1074
- template <typename... Args>
1075
- static void ggml_vk_mul_mat_q8_0(Args&&... args) {
1076
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
1077
- kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
1078
-
1079
- ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
1080
- }
1081
-
1082
- static void ggml_vk_mul_mat_q4_k(
1083
- kp::Sequence& seq,
1084
- const std::shared_ptr<kp::Tensor>& inA,
1085
- const std::shared_ptr<kp::Tensor>& inB,
1086
- const std::shared_ptr<kp::Tensor>& out,
1087
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1088
- int32_t ne00, int32_t ne01, int32_t ne02,
1089
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1090
- int32_t ne0, int32_t ne1,
1091
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
1092
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
1093
- uint32_t r2, uint32_t r3
1094
- ) {
1095
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
1096
- kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
1097
-
1098
- struct PushConstants {
1099
- uint32_t inAOff, inBOff, outOff;
1100
- int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
1101
- uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
1102
- uint32_t r2, r3;
1103
- } pushConsts {
1104
- inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1105
- ne00, ne10, ne0, ne1, ne01, ne02, ne12,
1106
- nb01, nb02, nb03, nb11, nb12, nb13,
1107
- r2, r3
1108
- };
1109
-
1110
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1111
- if (!komputeManager()->hasAlgorithm(__func__)) {
1112
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts});
1113
- } else {
1114
- s_algo = komputeManager()->getAlgorithm(__func__);
1115
- s_algo->setTensors({inA, inB, out});
1116
- s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)});
1117
- s_algo->setPushConstants<PushConstants>({pushConsts});
1118
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1119
- }
1120
- seq.record<kp::OpAlgoDispatch>(s_algo);
1121
- }
1122
-
1123
- static void ggml_vk_mul_mat_q6_k(
1124
- kp::Sequence& seq,
1125
- const std::shared_ptr<kp::Tensor>& inA,
1126
- const std::shared_ptr<kp::Tensor>& inB,
1127
- const std::shared_ptr<kp::Tensor>& out,
1128
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1129
- int32_t ne00, int32_t ne01, int32_t ne02,
1130
- int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1131
- int32_t ne0, int32_t ne1,
1132
- uint32_t nb01, uint32_t nb02, uint32_t nb03,
1133
- uint32_t nb11, uint32_t nb12, uint32_t nb13,
1134
- uint32_t r2, uint32_t r3
1135
- ) {
1136
- const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
1137
- kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
1138
-
1139
- struct PushConstants {
1140
- uint32_t inAOff, inBOff, outOff;
1141
- int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
1142
- uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
1143
- uint32_t r2, r3;
1144
- } pushConsts {
1145
- inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1146
- ne00, ne10, ne0, ne1, ne01, ne02, ne12,
1147
- nb01, nb02, nb03, nb11, nb12, nb13,
1148
- r2, r3
1149
- };
1150
-
1151
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1152
- if (!komputeManager()->hasAlgorithm(__func__)) {
1153
- const uint32_t local_x = 2;
1154
- const uint32_t local_y = ggml_vk_current_device().subgroupSize;
1155
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
1156
- } else {
1157
- s_algo = komputeManager()->getAlgorithm(__func__);
1158
- s_algo->setTensors({inA, inB, out});
1159
- s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
1160
- s_algo->setPushConstants<PushConstants>({pushConsts});
1161
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1162
- }
1163
- seq.record<kp::OpAlgoDispatch>(s_algo);
1164
- }
1165
-
1166
- static void ggml_vk_get_rows(
1167
- const std::vector<uint32_t>& spirv,
1168
- const char * suffix,
1169
- unsigned element_size, unsigned qk,
1170
- kp::Sequence& seq,
1171
- const std::shared_ptr<kp::Tensor>& inA,
1172
- const std::shared_ptr<kp::Tensor>& inB,
1173
- const std::shared_ptr<kp::Tensor>& out,
1174
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1175
- int32_t ne00, int32_t nb01, int32_t nb1,
1176
- uint32_t size
1177
- ) {
1178
- GGML_ASSERT(nb01%element_size == 0);
1179
- GGML_ASSERT(nb1%sizeof(float) == 0);
1180
- if (qk) GGML_ASSERT(ne00%qk == 0);
1181
-
1182
- struct PushConstants {
1183
- uint32_t inAOff, inBOff, outOff;
1184
- int32_t ne00, nb01, nb1;
1185
- } pushConsts {
1186
- safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1187
- ne00, nb01, nb1
1188
- };
1189
-
1190
- auto name = std::string(__func__) + "_" + suffix;
1191
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1192
- if (!komputeManager()->hasAlgorithm(name)) {
1193
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
1194
- } else {
1195
- s_algo = komputeManager()->getAlgorithm(name);
1196
- s_algo->setTensors({inA, inB, out});
1197
- s_algo->setWorkgroup({size});
1198
- s_algo->setPushConstants<PushConstants>({pushConsts});
1199
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1200
- }
1201
- seq.record<kp::OpAlgoDispatch>(s_algo);
1202
- }
1203
-
1204
- template <typename... Args>
1205
- static void ggml_vk_get_rows_f32(Args&&... args) {
1206
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
1207
- kp::shader_data::op_getrows_f32_comp_spv_len);
1208
-
1209
- ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
1210
- }
1211
-
1212
- template <typename... Args>
1213
- static void ggml_vk_get_rows_f16(Args&&... args) {
1214
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
1215
- kp::shader_data::op_getrows_f16_comp_spv_len);
1216
-
1217
- ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
1218
- }
1219
-
1220
- template <typename... Args>
1221
- static void ggml_vk_get_rows_q4_0(Args&&... args) {
1222
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
1223
- kp::shader_data::op_getrows_q4_0_comp_spv_len);
1224
-
1225
- ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
1226
- }
1227
-
1228
- template <typename... Args>
1229
- static void ggml_vk_get_rows_q4_1(Args&&... args) {
1230
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
1231
- kp::shader_data::op_getrows_q4_1_comp_spv_len);
1232
-
1233
- ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
1234
- }
1235
-
1236
- template <typename... Args>
1237
- static void ggml_vk_get_rows_q6_k(Args&&... args) {
1238
- const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
1239
- kp::shader_data::op_getrows_q6_k_comp_spv_len);
1240
- ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
1241
- }
1242
-
1243
- static void ggml_vk_rope(
1244
- kp::Sequence& seq,
1245
- const std::shared_ptr<kp::Tensor>& inA,
1246
- const std::shared_ptr<kp::Tensor>& inB,
1247
- const std::shared_ptr<kp::Tensor>& inC,
1248
- const std::shared_ptr<kp::Tensor>& out,
1249
- uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
1250
- ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
1251
- float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
1252
- int32_t ne01, int32_t ne02, int32_t ne03,
1253
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1254
- int32_t ne0,
1255
- uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
1256
- ) {
1257
- GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
1258
-
1259
- static const auto spirv_norm_f16 = getSpirvShader(
1260
- kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
1261
- );
1262
- static const auto spirv_norm_f32 = getSpirvShader(
1263
- kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
1264
- );
1265
- static const auto spirv_neox_f16 = getSpirvShader(
1266
- kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
1267
- );
1268
- static const auto spirv_neox_f32 = getSpirvShader(
1269
- kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
1270
- );
1271
-
1272
- int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
1273
-
1274
- GGML_ASSERT(nb03 % type_size == 0);
1275
- GGML_ASSERT(nb02 % type_size == 0);
1276
- GGML_ASSERT(nb01 % type_size == 0);
1277
- GGML_ASSERT(nb00 % type_size == 0);
1278
- GGML_ASSERT(nb3 % type_size == 0);
1279
- GGML_ASSERT(nb2 % type_size == 0);
1280
- GGML_ASSERT(nb1 % type_size == 0);
1281
- GGML_ASSERT(nb0 % type_size == 0);
1282
-
1283
- struct PushConstants {
1284
- uint32_t inAOff, inBOff, inCOff, outOff;
1285
- int32_t n_dims, mode, n_ctx_orig;
1286
- float freq_base, freq_scale;
1287
- bool has_freq_factors;
1288
- float ext_factor, attn_factor, beta_fast, beta_slow;
1289
- uint32_t nb00, nb01, nb02, nb03;
1290
- int32_t ne0;
1291
- uint32_t nb0, nb1, nb2, nb3;
1292
- } pushConsts {
1293
- safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
1294
- n_dims, mode, n_ctx_orig,
1295
- freq_base, freq_scale,
1296
- has_freq_factors,
1297
- ext_factor, attn_factor, beta_fast, beta_slow,
1298
- nb00, nb01, nb02, nb03,
1299
- ne0,
1300
- nb0, nb1, nb2, nb3
1301
- };
1302
-
1303
- auto & inC_ = inC ? inC : inA;
1304
- const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
1305
- const bool is_f16 = src0t == GGML_TYPE_F16;
1306
-
1307
- auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
1308
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1309
- if (!komputeManager()->hasAlgorithm(name)) {
1310
- auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
1311
- s_algo = komputeManager()->algorithm<float, PushConstants>(
1312
- name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
1313
- {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
1314
- );
1315
- } else {
1316
- s_algo = komputeManager()->getAlgorithm(name);
1317
- s_algo->setTensors({inA, inB, inC_, out});
1318
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1319
- s_algo->setPushConstants<PushConstants>({pushConsts});
1320
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1321
- }
1322
- seq.record<kp::OpAlgoDispatch>(s_algo);
1323
- }
1324
-
1325
- static void ggml_vk_cpy(
1326
- const std::vector<uint32_t>& spirv,
1327
- uint32_t in_element_size, uint32_t out_element_size,
1328
- kp::Sequence& seq,
1329
- const std::shared_ptr<kp::Tensor>& in,
1330
- const std::shared_ptr<kp::Tensor>& out,
1331
- uint32_t inOff, uint32_t outOff,
1332
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03,
1333
- uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1334
- int32_t ne0, int32_t ne1, int32_t ne2,
1335
- uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3
1336
- ) {
1337
- struct PushConstants {
1338
- uint32_t inOff, outOff;
1339
- int32_t ne00, ne01, ne02;
1340
- uint32_t nb00, nb01, nb02, nb03;
1341
- int32_t ne0, ne1, ne2;
1342
- uint32_t nb0, nb1, nb2, nb3;
1343
- } pushConsts {
1344
- safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size),
1345
- ne00, ne01, ne02,
1346
- nb00, nb01, nb02, nb03,
1347
- ne0, ne1, ne2,
1348
- nb0, nb1, nb2, nb3
1349
- };
1350
-
1351
- std::string name = std::string(__func__)
1352
- + "_i_" + std::to_string(in_element_size)
1353
- + "_o_" + std::to_string(out_element_size);
1354
- std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1355
- if (!komputeManager()->hasAlgorithm(name))
1356
- s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
1357
- else {
1358
- s_algo = komputeManager()->getAlgorithm(name);
1359
- s_algo->setTensors({in, out});
1360
- s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1361
- s_algo->setPushConstants<PushConstants>({pushConsts});
1362
- s_algo->updateDescriptors(s_kompute_context->pool.get());
1363
- }
1364
- seq.record<kp::OpAlgoDispatch>(s_algo);
1365
- }
1366
-
1367
- template <typename... Args>
1368
- static void ggml_vk_cpy_f32_f16(Args&&... args) {
1369
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
1370
- kp::shader_data::op_cpy_f32_f16_comp_spv_len);
1371
- ggml_vk_cpy(spirv, 4, 2, std::forward<Args>(args)...);
1372
- }
1373
-
1374
- template <typename... Args>
1375
- static void ggml_vk_cpy_f32_f32(Args&&... args) {
1376
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
1377
- kp::shader_data::op_cpy_f32_f32_comp_spv_len);
1378
- ggml_vk_cpy(spirv, 4, 4, std::forward<Args>(args)...);
1379
- }
1380
-
1381
- template <typename... Args>
1382
- static void ggml_vk_cpy_f16_f16(Args&&... args) {
1383
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
1384
- kp::shader_data::op_cpy_f16_f16_comp_spv_len);
1385
- ggml_vk_cpy(spirv, 2, 2, std::forward<Args>(args)...);
1386
- }
1387
-
1388
- template <typename... Args>
1389
- static void ggml_vk_cpy_f16_f32(Args&&... args) {
1390
- const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
1391
- kp::shader_data::op_cpy_f16_f32_comp_spv_len);
1392
- ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
1393
- }
1394
-
1395
- static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
1396
- int64_t n = ggml_nelements(op);
1397
- switch (op->op) {
1398
- case GGML_OP_UNARY:
1399
- if (n % 4 != 0) return false;
1400
- switch (ggml_get_unary_op(op)) {
1401
- case GGML_UNARY_OP_GELU:
1402
- if (n % 8 != 0) return false;
1403
- // fall through
1404
- case GGML_UNARY_OP_RELU:
1405
- case GGML_UNARY_OP_SILU:
1406
- return ggml_is_contiguous(op->src[0]);
1407
- default:
1408
- ;
1409
- }
1410
- break;
1411
- case GGML_OP_NONE:
1412
- case GGML_OP_RESHAPE:
1413
- case GGML_OP_VIEW:
1414
- case GGML_OP_TRANSPOSE:
1415
- case GGML_OP_PERMUTE:
1416
- case GGML_OP_ADD:
1417
- case GGML_OP_MUL:
1418
- case GGML_OP_SCALE:
1419
- case GGML_OP_SOFT_MAX:
1420
- case GGML_OP_RMS_NORM:
1421
- case GGML_OP_NORM:
1422
- return true;
1423
- case GGML_OP_ROPE:
1424
- {
1425
- const int mode = ((const int32_t *) op->op_params)[2];
1426
- if (mode & GGML_ROPE_TYPE_MROPE) {
1427
- return false;
1428
- }
1429
- if (mode & GGML_ROPE_TYPE_VISION) {
1430
- return false;
1431
- }
1432
- return true;
1433
- }
1434
- case GGML_OP_DUP:
1435
- case GGML_OP_CPY:
1436
- case GGML_OP_CONT:
1437
- switch (op->src[0]->type) {
1438
- case GGML_TYPE_F32:
1439
- case GGML_TYPE_F16:
1440
- break;
1441
- default:
1442
- return false;
1443
- }
1444
- switch (op->type) {
1445
- case GGML_TYPE_F32:
1446
- case GGML_TYPE_F16:
1447
- break;
1448
- default:
1449
- return false;
1450
- }
1451
- return true;
1452
- case GGML_OP_DIAG_MASK_INF:
1453
- return op->ne[3] == 1;
1454
- case GGML_OP_GET_ROWS:
1455
- switch (op->src[0]->type) {
1456
- case GGML_TYPE_F32:
1457
- case GGML_TYPE_F16:
1458
- case GGML_TYPE_Q4_0:
1459
- case GGML_TYPE_Q4_1:
1460
- case GGML_TYPE_Q6_K:
1461
- return op->ne[2] == 1 && op->ne[3] == 1;
1462
- default:
1463
- ;
1464
- }
1465
- return false;
1466
- case GGML_OP_MUL_MAT:
1467
- if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1]))
1468
- return false;
1469
-
1470
- switch (op->src[0]->type) {
1471
- case GGML_TYPE_F32:
1472
- return op->ne[3] == 1;
1473
- case GGML_TYPE_Q6_K:
1474
- case GGML_TYPE_F16:
1475
- case GGML_TYPE_Q8_0:
1476
- case GGML_TYPE_Q4_0:
1477
- case GGML_TYPE_Q4_1:
1478
- case GGML_TYPE_Q4_K:
1479
- return true;
1480
- default:
1481
- ;
1482
- }
1483
- default:
1484
- ;
1485
- }
1486
- return false;
1487
-
1488
- GGML_UNUSED(dev);
1489
- }
1490
-
1491
- static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
1492
- const int n_seq = 8;
1493
-
1494
- // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting
1495
- // it to the size of the graph, but I think it can be made smaller?
1496
- ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes);
1497
-
1498
- std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
1499
-
1500
- for (auto& sequence : sequences) {
1501
- sequence = komputeManager()->sequence();
1502
- }
1503
- for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
1504
- const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
1505
-
1506
- auto& seq = *sequences[seq_idx];
1507
-
1508
- const int node_start = (seq_idx + 0) * n_nodes_per_seq;
1509
- const int node_end = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes);
1510
-
1511
- bool any_commands_recorded = false;
1512
-
1513
- for (int i = node_start; i < node_end; ++i) {
1514
- struct ggml_tensor * src0 = gf->nodes[i]->src[0];
1515
- struct ggml_tensor * src1 = gf->nodes[i]->src[1];
1516
- struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2);
1517
- struct ggml_tensor * dst = gf->nodes[i];
1518
- GGML_ASSERT(dst->data != nullptr);
1519
-
1520
- if (ggml_is_empty(dst)) {
1521
- continue;
1522
- }
1523
-
1524
- switch (dst->op) {
1525
- case GGML_OP_NONE:
1526
- case GGML_OP_RESHAPE:
1527
- case GGML_OP_VIEW:
1528
- case GGML_OP_TRANSPOSE:
1529
- case GGML_OP_PERMUTE:
1530
- continue; // noop -> next node
1531
- default:
1532
- break;
1533
- }
1534
-
1535
- any_commands_recorded = true;
1536
-
1537
- const int32_t ne00 = src0 ? src0->ne[0] : 0;
1538
- const int32_t ne01 = src0 ? src0->ne[1] : 0;
1539
- const int32_t ne02 = src0 ? src0->ne[2] : 0;
1540
- const int32_t ne03 = src0 ? src0->ne[3] : 0;
1541
-
1542
- const uint32_t nb00 = src0 ? src0->nb[0] : 0;
1543
- const uint32_t nb01 = src0 ? src0->nb[1] : 0;
1544
- const uint32_t nb02 = src0 ? src0->nb[2] : 0;
1545
- const uint32_t nb03 = src0 ? src0->nb[3] : 0;
1546
-
1547
- const int32_t ne10 = src1 ? src1->ne[0] : 0;
1548
- const int32_t ne11 = src1 ? src1->ne[1] : 0;
1549
- const int32_t ne12 = src1 ? src1->ne[2] : 0;
1550
- const int32_t ne13 = src1 ? src1->ne[3] : 0;
1551
-
1552
- const uint32_t nb10 = src1 ? src1->nb[0] : 0;
1553
- const uint32_t nb11 = src1 ? src1->nb[1] : 0;
1554
- const uint32_t nb12 = src1 ? src1->nb[2] : 0;
1555
- const uint32_t nb13 = src1 ? src1->nb[3] : 0;
1556
-
1557
- const int32_t ne0 = dst ? dst->ne[0] : 0;
1558
- const int32_t ne1 = dst ? dst->ne[1] : 0;
1559
- const int32_t ne2 = dst ? dst->ne[2] : 0;
1560
- // const int32_t ne3 = dst ? dst->ne[3] : 0;
1561
-
1562
- const uint32_t nb0 = dst ? dst->nb[0] : 0;
1563
- const uint32_t nb1 = dst ? dst->nb[1] : 0;
1564
- const uint32_t nb2 = dst ? dst->nb[2] : 0;
1565
- const uint32_t nb3 = dst ? dst->nb[3] : 0;
1566
-
1567
- const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
1568
- const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
1569
- const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
1570
-
1571
- const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
1572
- uint32_t off_src0 = 0;
1573
- uint32_t off_src1 = 0;
1574
- uint32_t off_src2 = 0;
1575
- uint32_t off_dst = 0;
1576
- const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
1577
- const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
1578
- const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
1579
- const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
1580
-
1581
- switch (dst->op) {
1582
- case GGML_OP_ADD:
1583
- {
1584
- if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
1585
- // src1 is a row
1586
- ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
1587
- } else {
1588
- ggml_vk_add(
1589
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1590
- ne00, ne01, ne02, ne03,
1591
- nb00, nb01, nb02, nb03,
1592
- ne10, ne11, ne12, ne13,
1593
- nb10, nb11, nb12, nb13,
1594
- ne0,
1595
- nb0, nb1, nb2, nb3
1596
- );
1597
- }
1598
- } break;
1599
- case GGML_OP_MUL:
1600
- {
1601
- ggml_vk_mul(
1602
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1603
- ne00, ne01, ne02, ne03,
1604
- nb00, nb01, nb02, nb03,
1605
- ne10, ne11, ne12, ne13,
1606
- nb10, nb11, nb12, nb13,
1607
- ne0,
1608
- nb0, nb1, nb2, nb3
1609
- );
1610
- } break;
1611
- case GGML_OP_SCALE:
1612
- {
1613
- float scale; memcpy(&scale, dst->op_params, sizeof(float));
1614
-
1615
- ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
1616
- } break;
1617
- case GGML_OP_UNARY:
1618
- {
1619
- int64_t n = ggml_nelements(dst);
1620
- GGML_ASSERT(n % 4 == 0);
1621
- switch (ggml_get_unary_op(gf->nodes[i])) {
1622
- case GGML_UNARY_OP_SILU:
1623
- {
1624
- ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
1625
- } break;
1626
- case GGML_UNARY_OP_RELU:
1627
- {
1628
- ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
1629
- } break;
1630
- case GGML_UNARY_OP_GELU:
1631
- {
1632
- GGML_ASSERT(n % 8 == 0);
1633
- ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8);
1634
- } break;
1635
- default:
1636
- {
1637
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1638
- GGML_ABORT("fatal error");
1639
- }
1640
- }
1641
- } break;
1642
- case GGML_OP_SOFT_MAX:
1643
- {
1644
- float scale;
1645
- float max_bias;
1646
-
1647
- memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
1648
- memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
1649
-
1650
- #pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
1651
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
1652
- GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
1653
-
1654
- const int64_t nrows_x = ggml_nrows(src0);
1655
- const int64_t nrows_y = src0->ne[1];
1656
-
1657
- const uint32_t n_head = nrows_x/nrows_y;
1658
- const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
1659
-
1660
- const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
1661
- const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1662
-
1663
- ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
1664
- } break;
1665
- case GGML_OP_DIAG_MASK_INF:
1666
- {
1667
- const int n_past = ((int32_t *)(dst->op_params))[0];
1668
- ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
1669
- } break;
1670
- case GGML_OP_NORM:
1671
- {
1672
- float eps;
1673
- memcpy(&eps, dst->op_params, sizeof(float));
1674
- ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
1675
- } break;
1676
- case GGML_OP_RMS_NORM:
1677
- {
1678
- GGML_ASSERT(ne00 % 4 == 0);
1679
-
1680
- float eps;
1681
- memcpy(&eps, dst->op_params, sizeof(float));
1682
- ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
1683
- } break;
1684
- case GGML_OP_MUL_MAT:
1685
- {
1686
- GGML_ASSERT(ne00 == ne10);
1687
-
1688
- GGML_ASSERT(ne12 % ne02 == 0);
1689
- GGML_ASSERT(ne13 % ne03 == 0);
1690
-
1691
- const uint32_t r2 = ne12/ne02;
1692
- const uint32_t r3 = ne13/ne03;
1693
-
1694
- if (src1t != GGML_TYPE_F32) {
1695
- fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1696
- goto not_implemented;
1697
- }
1698
-
1699
- if (ggml_is_transposed(src0) ||
1700
- ggml_is_transposed(src1)) {
1701
- fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1702
- goto not_implemented;
1703
- }
1704
-
1705
- switch (src0t) {
1706
- case GGML_TYPE_F32:
1707
- ggml_vk_mul_mat_mat_f32(
1708
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1709
- ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2
1710
- );
1711
- break;
1712
- case GGML_TYPE_F16:
1713
- ggml_vk_mul_mat_f16(
1714
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1715
- ne00, ne01, ne02, nb00, nb01, nb02, nb03,
1716
- ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
1717
- ne0, ne1, r2, r3
1718
- );
1719
- break;
1720
- case GGML_TYPE_Q8_0:
1721
- ggml_vk_mul_mat_q8_0(
1722
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1723
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1724
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1725
- );
1726
- break;
1727
- case GGML_TYPE_Q4_0:
1728
- ggml_vk_mul_mat_q4_0(
1729
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1730
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1731
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1732
- );
1733
- break;
1734
- case GGML_TYPE_Q4_1:
1735
- ggml_vk_mul_mat_q4_1(
1736
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1737
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1738
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1739
- );
1740
- break;
1741
- case GGML_TYPE_Q4_K:
1742
- ggml_vk_mul_mat_q4_k(
1743
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1744
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1745
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1746
- );
1747
- break;
1748
- case GGML_TYPE_Q6_K:
1749
- ggml_vk_mul_mat_q6_k(
1750
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1751
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1752
- nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1753
- );
1754
- break;
1755
- default: {
1756
- fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t);
1757
- goto not_implemented;
1758
- }
1759
- }
1760
-
1761
- } break;
1762
- case GGML_OP_GET_ROWS:
1763
- {
1764
- if (src0t == GGML_TYPE_F32) {
1765
- ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1766
- } else if (src0t == GGML_TYPE_F16) {
1767
- ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1768
- } else if (src0t == GGML_TYPE_Q4_0) {
1769
- ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1770
- } else if (src0t == GGML_TYPE_Q4_1) {
1771
- ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1772
- } else if (src0t == GGML_TYPE_Q6_K) {
1773
- ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
1774
- } else {
1775
- fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
1776
- goto not_implemented;
1777
- }
1778
- } break;
1779
- case GGML_OP_ROPE:
1780
- {
1781
- GGML_ASSERT(ne10 == ne02);
1782
- GGML_ASSERT(src0t == dstt);
1783
- // const int n_past = ((int32_t *) dst->op_params)[0];
1784
- const int n_dims = ((int32_t *) dst->op_params)[1];
1785
- const int mode = ((int32_t *) dst->op_params)[2];
1786
- // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
1787
- const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
1788
-
1789
- const bool has_freq_factors = dst->src[2] != nullptr;
1790
-
1791
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1792
- memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
1793
- memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
1794
- memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
1795
- memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
1796
- memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1797
- memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1798
- ggml_vk_rope(
1799
- seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
1800
- freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
1801
- ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
1802
- );
1803
- } break;
1804
- case GGML_OP_DUP:
1805
- case GGML_OP_CPY:
1806
- case GGML_OP_CONT:
1807
- {
1808
- switch (src0t) {
1809
- case GGML_TYPE_F32:
1810
- {
1811
- switch (dstt) {
1812
- case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1813
- case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1814
- default: goto not_implemented;
1815
- }
1816
- } break;
1817
- case GGML_TYPE_F16:
1818
- {
1819
- switch (dstt) {
1820
- case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1821
- case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
1822
- default: goto not_implemented;
1823
- } break;
1824
- default: goto not_implemented;
1825
- }
1826
- }
1827
- } break;
1828
- default: goto not_implemented;
1829
- }
1830
- continue;
1831
- not_implemented: {}
1832
- fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
1833
- //GGML_ABORT("fatal error");
1834
- }
1835
-
1836
- // Evaluate sequence
1837
- if (any_commands_recorded) {
1838
- seq.evalAsync();
1839
- }
1840
- }
1841
-
1842
- // Wait for all sequences to finish
1843
- for (auto& sequence : sequences) {
1844
- if (sequence->isRunning())
1845
- sequence->evalAwait();
1846
- }
1847
-
1848
- ggml_vk_free_descriptor_pool(ctx);
1849
- }
1850
-
1851
- template<>
1852
- kp::Tensor::TensorDataTypes
1853
- kp::TensorT<half>::dataType()
1854
- {
1855
- return TensorDataTypes::eFloat;
1856
- }
1857
-
1858
- template<>
1859
- kp::Tensor::TensorDataTypes
1860
- kp::TensorT<uint8_t>::dataType()
1861
- {
1862
- return TensorDataTypes::eUnsignedInt;
1863
- }
1864
-
1865
- ////////////////////////////////////////////////////////////////////////////////
1866
-
1867
- // backend interface
1868
-
1869
- struct ggml_backend_kompute_buffer_type_context {
1870
- int device;
1871
- int device_ref = 0;
1872
- uint64_t buffer_alignment;
1873
- uint64_t max_alloc;
1874
- std::string name;
1875
-
1876
- ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
1877
- : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
1878
- };
1879
-
1880
- static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
1881
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1882
-
1883
- if (!ctx->device_ref) {
1884
- komputeManager()->initializeDevice(
1885
- ctx->device, {}, {
1886
- "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
1887
- "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
1888
- }
1889
- );
1890
- }
1891
-
1892
- assert(ggml_vk_has_device());
1893
- ctx->device_ref++;
1894
- }
1895
-
1896
- static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
1897
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1898
-
1899
- assert(ctx->device_ref > 0);
1900
-
1901
- ctx->device_ref--;
1902
-
1903
- if (!ctx->device_ref) {
1904
- komputeManager.destroy();
1905
- }
1906
- }
1907
-
1908
- static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
1909
- auto * memory = (ggml_vk_memory *)buffer->context;
1910
- if (ggml_vk_has_device()) {
1911
- ggml_vk_free_memory(*memory);
1912
- }
1913
- delete memory;
1914
- }
1915
-
1916
- static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
1917
- return ((ggml_vk_memory *)buffer->context)->data;
1918
- }
1919
-
1920
- static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
1921
- GGML_UNUSED(buffer);
1922
-
1923
- const auto res = ggml_vk_get_tensor(tensor);
1924
- GGML_ASSERT(res);
1925
-
1926
- memcpy((char *)tensor->data + offset, data, size);
1927
-
1928
- komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
1929
- }
1930
-
1931
- static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
1932
- GGML_UNUSED(buffer);
1933
-
1934
- const auto res = ggml_vk_get_tensor(tensor);
1935
- GGML_ASSERT(res);
1936
-
1937
- komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
1938
-
1939
- memcpy(data, (const char *)tensor->data + offset, size);
1940
- }
1941
-
1942
- static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
1943
- auto * memory = (ggml_vk_memory *)buffer->context;
1944
- memset(memory->data, value, buffer->size);
1945
-
1946
- if (memory->stagingBuffer)
1947
- komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory->primaryBuffer, memory->stagingBuffer, memory->size);
1948
- }
1949
-
1950
- static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
1951
- /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
1952
- /* .get_base = */ ggml_backend_kompute_buffer_get_base,
1953
- /* .init_tensor = */ NULL,
1954
- /* .memset_tensor = */ NULL,
1955
- /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor,
1956
- /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor,
1957
- /* .cpy_tensor = */ NULL,
1958
- /* .clear = */ ggml_backend_kompute_buffer_clear,
1959
- /* .reset = */ NULL,
1960
- };
1961
-
1962
- // default buffer type
1963
-
1964
- static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
1965
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1966
- return ctx->name.c_str();
1967
- }
1968
-
1969
- static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
1970
- ggml_backend_kompute_device_ref(buft);
1971
- auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
1972
- return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
1973
- }
1974
-
1975
- static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
1976
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1977
- return ctx->buffer_alignment;
1978
- }
1979
-
1980
- static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
1981
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
1982
- return ctx->max_alloc;
1983
- }
1984
-
1985
- static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
1986
- /* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
1987
- /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
1988
- /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
1989
- /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
1990
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
1991
- /* .is_host = */ NULL,
1992
- };
1993
-
1994
- ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
1995
- static std::mutex mutex;
1996
- std::lock_guard<std::mutex> lock(mutex);
1997
-
1998
- auto devices = ggml_vk_available_devices();
1999
- int32_t device_count = (int32_t) devices.size();
2000
- GGML_ASSERT(device < device_count);
2001
- GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES);
2002
-
2003
- static ggml_backend_buffer_type
2004
- ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES];
2005
-
2006
- static bool ggml_backend_kompute_buffer_type_initialized = false;
2007
-
2008
- if (!ggml_backend_kompute_buffer_type_initialized) {
2009
- for (int32_t i = 0; i < device_count; i++) {
2010
- ggml_backend_kompute_buffer_types[i] = {
2011
- /* .iface = */ ggml_backend_kompute_buffer_type_interface,
2012
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i),
2013
- /* .context = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc },
2014
- };
2015
- }
2016
- ggml_backend_kompute_buffer_type_initialized = true;
2017
- }
2018
-
2019
- return &ggml_backend_kompute_buffer_types[device];
2020
- }
2021
-
2022
- // backend
2023
-
2024
- static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
2025
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
2026
- return ctx->name.c_str();
2027
- }
2028
-
2029
- static void ggml_backend_kompute_free(ggml_backend_t backend) {
2030
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
2031
-
2032
- assert(ctx == s_kompute_context);
2033
- s_kompute_context = nullptr;
2034
- if (ctx != nullptr) {
2035
- delete ctx;
2036
- }
2037
-
2038
- delete backend;
2039
- }
2040
-
2041
- static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
2042
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
2043
- ggml_vk_graph_compute(ctx, cgraph);
2044
- return GGML_STATUS_SUCCESS;
2045
- }
2046
-
2047
- static struct ggml_backend_i kompute_backend_i = {
2048
- /* .get_name = */ ggml_backend_kompute_name,
2049
- /* .free = */ ggml_backend_kompute_free,
2050
- /* .set_tensor_async = */ NULL,
2051
- /* .get_tensor_async = */ NULL,
2052
- /* .cpy_tensor_async = */ NULL,
2053
- /* .synchronize = */ NULL,
2054
- /* .graph_plan_create = */ NULL,
2055
- /* .graph_plan_free = */ NULL,
2056
- /* .graph_plan_update = */ NULL,
2057
- /* .graph_plan_compute = */ NULL,
2058
- /* .graph_compute = */ ggml_backend_kompute_graph_compute,
2059
- /* .event_record = */ NULL,
2060
- /* .event_wait = */ NULL,
2061
- };
2062
-
2063
- static ggml_guid_t ggml_backend_kompute_guid() {
2064
- static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca };
2065
- return &guid;
2066
- }
2067
-
2068
- ggml_backend_t ggml_backend_kompute_init(int device) {
2069
- GGML_ASSERT(s_kompute_context == nullptr);
2070
- s_kompute_context = new ggml_kompute_context(device);
2071
-
2072
- ggml_backend_t kompute_backend = new ggml_backend {
2073
- /* .guid = */ ggml_backend_kompute_guid(),
2074
- /* .interface = */ kompute_backend_i,
2075
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device),
2076
- /* .context = */ s_kompute_context,
2077
- };
2078
-
2079
- return kompute_backend;
2080
- }
2081
-
2082
- bool ggml_backend_is_kompute(ggml_backend_t backend) {
2083
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
2084
- }
2085
-
2086
- static size_t ggml_backend_kompute_get_device_count() {
2087
- auto devices = ggml_vk_available_devices();
2088
- return devices.size();
2089
- }
2090
-
2091
- static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) {
2092
- auto devices = ggml_vk_available_devices();
2093
- GGML_ASSERT((size_t) device < devices.size());
2094
- snprintf(description, description_size, "%s", devices[device].name);
2095
- }
2096
-
2097
- static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) {
2098
- auto devices = ggml_vk_available_devices();
2099
- GGML_ASSERT((size_t) device < devices.size());
2100
- *total = devices[device].heapSize;
2101
- *free = devices[device].heapSize;
2102
- }
2103
-
2104
- //////////////////////////
2105
-
2106
- struct ggml_backend_kompute_device_context {
2107
- int device;
2108
- std::string name;
2109
- std::string description;
2110
- };
2111
-
2112
- static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) {
2113
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2114
- return ctx->name.c_str();
2115
- }
2116
-
2117
- static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) {
2118
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2119
- return ctx->description.c_str();
2120
- }
2121
-
2122
- static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
2123
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2124
- ggml_backend_kompute_get_device_memory(ctx->device, free, total);
2125
- }
2126
-
2127
- static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) {
2128
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2129
- return ggml_backend_kompute_buffer_type(ctx->device);
2130
- }
2131
-
2132
- static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2133
- if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) {
2134
- return false;
2135
- }
2136
-
2137
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2138
- ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context;
2139
-
2140
- return buft_ctx->device == ctx->device;
2141
- }
2142
-
2143
- static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) {
2144
- GGML_UNUSED(dev);
2145
- return GGML_BACKEND_DEVICE_TYPE_GPU;
2146
- }
2147
-
2148
- static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
2149
- props->name = ggml_backend_kompute_device_get_name(dev);
2150
- props->description = ggml_backend_kompute_device_get_description(dev);
2151
- props->type = ggml_backend_kompute_device_get_type(dev);
2152
- ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total);
2153
- props->caps = {
2154
- /* async = */ false,
2155
- /* host_buffer = */ false,
2156
- /* .buffer_from_host_ptr = */ false,
2157
- /* events = */ false,
2158
- };
2159
- }
2160
-
2161
- static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) {
2162
- GGML_UNUSED(params);
2163
- ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context;
2164
- return ggml_backend_kompute_init(ctx->device);
2165
- }
2166
-
2167
- static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
2168
- const int min_batch_size = 32;
2169
-
2170
- return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
2171
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
2172
-
2173
- GGML_UNUSED(dev);
2174
- }
2175
-
2176
- static const struct ggml_backend_device_i ggml_backend_kompute_device_i = {
2177
- /* .get_name = */ ggml_backend_kompute_device_get_name,
2178
- /* .get_description = */ ggml_backend_kompute_device_get_description,
2179
- /* .get_memory = */ ggml_backend_kompute_device_get_memory,
2180
- /* .get_type = */ ggml_backend_kompute_device_get_type,
2181
- /* .get_props = */ ggml_backend_kompute_device_get_props,
2182
- /* .init_backend = */ ggml_backend_kompute_device_init,
2183
- /* .get_buffer_type = */ ggml_backend_kompute_device_get_buffer_type,
2184
- /* .get_host_buffer_type = */ NULL,
2185
- /* .buffer_from_host_ptr = */ NULL,
2186
- /* .supports_op = */ ggml_backend_kompute_device_supports_op,
2187
- /* .supports_buft = */ ggml_backend_kompute_device_supports_buft,
2188
- /* .offload_op = */ ggml_backend_kompute_device_offload_op,
2189
- /* .event_new = */ NULL,
2190
- /* .event_free = */ NULL,
2191
- /* .event_synchronize = */ NULL,
2192
- };
2193
-
2194
- static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) {
2195
- GGML_UNUSED(reg);
2196
- return "Kompute";
2197
- }
2198
-
2199
- static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) {
2200
- GGML_UNUSED(reg);
2201
- return ggml_backend_kompute_get_device_count();
2202
- }
2203
-
2204
- static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) {
2205
- static std::vector<ggml_backend_dev_t> devices;
2206
-
2207
- static bool initialized = false;
2208
-
2209
- {
2210
- static std::mutex mutex;
2211
- std::lock_guard<std::mutex> lock(mutex);
2212
- if (!initialized) {
2213
- for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) {
2214
- ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context;
2215
- char desc[256];
2216
- ggml_backend_kompute_get_device_description(i, desc, sizeof(desc));
2217
- ctx->device = i;
2218
- ctx->name = "Kompute" + std::to_string(i);
2219
- ctx->description = desc;
2220
- devices.push_back(new ggml_backend_device {
2221
- /* .iface = */ ggml_backend_kompute_device_i,
2222
- /* .reg = */ reg,
2223
- /* .context = */ ctx,
2224
- });
2225
- }
2226
- initialized = true;
2227
- }
2228
- }
2229
-
2230
- GGML_ASSERT(device < devices.size());
2231
- return devices[device];
2232
- }
2233
-
2234
- static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = {
2235
- /* .get_name = */ ggml_backend_kompute_reg_get_name,
2236
- /* .get_device_count = */ ggml_backend_kompute_reg_get_device_count,
2237
- /* .get_device = */ ggml_backend_kompute_reg_get_device,
2238
- /* .get_proc_address = */ NULL,
2239
- };
2240
-
2241
- ggml_backend_reg_t ggml_backend_kompute_reg() {
2242
- static ggml_backend_reg reg = {
2243
- /* .api_version = */ GGML_BACKEND_API_VERSION,
2244
- /* .iface = */ ggml_backend_kompute_reg_i,
2245
- /* .context = */ nullptr,
2246
- };
2247
-
2248
- return &reg;
2249
- }
2250
-
2251
- GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
 
ggml/src/ggml-kompute/kompute-shaders/common.comp DELETED
@@ -1,112 +0,0 @@
1
- #extension GL_EXT_shader_16bit_storage: require
2
- #extension GL_EXT_shader_8bit_storage: require
3
- #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
4
- #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
5
- #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
6
- #extension GL_EXT_shader_explicit_arithmetic_types_int64: require
7
- #extension GL_EXT_control_flow_attributes: enable
8
- #extension GL_KHR_shader_subgroup_arithmetic : require
9
- #extension GL_EXT_debug_printf : enable
10
-
11
- #define QK4_0 32
12
- #define QK4_1 32
13
-
14
- #define GELU_COEF_A 0.044715
15
- #define SQRT_2_OVER_PI 0.79788456080286535587989211986876
16
- #define TWOPI_F 6.283185307179586f
17
-
18
- #define QK_K 256
19
- #define K_SCALE_SIZE 12
20
-
21
- #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
22
- #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
23
- #define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
24
- #define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
25
-
26
- #define sizeof_block_q4_0 0x12
27
- struct block_q4_0 {
28
- float16_t d;
29
- uint8_t qs[QK4_0 / 2];
30
- };
31
- mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
32
- const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
33
- const float d2 = d1 / 256.f;
34
- const float md = -8.f * xb.d;
35
- const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
36
- const uint16_t mask1 = mask0 << 8;
37
-
38
- mat4 reg;
39
- for (int i=0;i<8;i++) {
40
- uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
41
- reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
42
- reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
43
- }
44
- return reg;
45
- }
46
-
47
- #define sizeof_block_q4_1 0x14
48
- struct block_q4_1 {
49
- float16_t d;
50
- float16_t m;
51
- uint8_t qs[QK4_1 / 2];
52
- };
53
- mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
54
- const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
55
- const float d2 = d1 / 256.f;
56
- const float m = xb.m;
57
- const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
58
- const uint16_t mask1 = mask0 << 8;
59
-
60
- mat4 reg;
61
- for (int i=0;i<8;i++) {
62
- uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
63
- reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
64
- reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
65
- }
66
- return reg;
67
- }
68
-
69
- #define sizeof_block_q4_k 144
70
- struct block_q4_k {
71
- float16_t d;
72
- float16_t dmin;
73
- uint8_t scales[K_SCALE_SIZE];
74
- uint8_t qs[QK_K/2];
75
- };
76
-
77
- #define sizeof_block_q6_k 210
78
- struct block_q6_k {
79
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
80
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
81
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
82
- float16_t d; // super-block scale
83
- };
84
- mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
85
- const float16_t d_all = xb.d;
86
-
87
- const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
88
- const uint qhIndex = 32*(il/8) + 16*(il&1);
89
- float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
90
- il = (il/2) & 3;
91
-
92
- const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
93
- const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
94
- const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
95
- const float16_t ml = float16_t(d_all * sc * 32.f);
96
- const float16_t dl = float16_t(d_all * sc * coef);
97
- mat4 reg;
98
- for (int i = 0; i < 16; ++i) {
99
- const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
100
- : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
101
- reg[i/4][i%4] = dl * q - ml;
102
- }
103
- return reg;
104
- }
105
-
106
-
107
- #define QK8_0 32
108
- // struct block_q8_0 {
109
- // float16_t d; // delta
110
- // int8_t qs[QK8_0]; // quants
111
- // };
112
- #define sizeof_block_q8_0 34
 
ggml/src/ggml-kompute/kompute-shaders/op_add.comp DELETED
@@ -1,58 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1024) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
8
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
9
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
10
-
11
- layout(push_constant) uniform PushConstants {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb00;
17
- int nb01;
18
- int nb02;
19
- int nb03;
20
- int ne10;
21
- int ne11;
22
- int ne12;
23
- int ne13;
24
- int nb10;
25
- int nb11;
26
- int nb12;
27
- int nb13;
28
- int ne0;
29
- int nb0;
30
- int nb1;
31
- int nb2;
32
- int nb3;
33
- //int offs; // TODO: needed for GGML_OP_ACC, see metal code
34
- } pcs;
35
-
36
- // general-purpose kernel for addition of two tensors
37
- // pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
38
- // cons: not very efficient
39
- void main() {
40
- const uint i03 = gl_WorkGroupID.z;
41
- const uint i02 = gl_WorkGroupID.y;
42
- const uint i01 = gl_WorkGroupID.x;
43
-
44
- const uint i13 = i03 % pcs.ne13;
45
- const uint i12 = i02 % pcs.ne12;
46
- const uint i11 = i01 % pcs.ne11;
47
-
48
- int offs = 0; // TMP (see above)
49
-
50
- uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
51
- uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
52
- uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
53
-
54
- for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
55
- const uint i10 = i0 % pcs.ne10;
56
- out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
57
- }
58
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp DELETED
@@ -1,25 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
8
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
9
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
10
-
11
- layout(push_constant) uniform PushConstants {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- uint row;
16
- } pcs;
17
-
18
- void main() {
19
- const uint baseIndex = gl_WorkGroupID.x * 4;
20
-
21
- for (uint x = 0; x < 4; x++) {
22
- const uint i = baseIndex + x;
23
- out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
24
- }
25
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float16_t
6
- #define IN_TYPE_SIZE 2
7
- #define OUT_TYPE float16_t
8
- #define OUT_TYPE_SIZE 2
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float16_t
6
- #define IN_TYPE_SIZE 2
7
- #define OUT_TYPE float
8
- #define OUT_TYPE_SIZE 4
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float
6
- #define IN_TYPE_SIZE 4
7
- #define OUT_TYPE float16_t
8
- #define OUT_TYPE_SIZE 2
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define IN_TYPE float
6
- #define IN_TYPE_SIZE 4
7
- #define OUT_TYPE float
8
- #define OUT_TYPE_SIZE 4
9
-
10
- layout(local_size_x = 1024) in;
11
-
12
- layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
13
- layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inOff;
17
- uint outOff;
18
- int ne00;
19
- int ne01;
20
- int ne02;
21
- uint nb00;
22
- uint nb01;
23
- uint nb02;
24
- uint nb03;
25
- int ne0;
26
- int ne1;
27
- int ne2;
28
- uint nb0;
29
- uint nb1;
30
- uint nb2;
31
- uint nb3;
32
- } pcs;
33
-
34
- void main() {
35
- const uint i03 = gl_WorkGroupID.z;
36
- const uint i02 = gl_WorkGroupID.y;
37
- const uint i01 = gl_WorkGroupID.x;
38
-
39
- const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
40
-
41
- const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
42
- const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
43
- const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
44
- const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
45
-
46
- const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
47
-
48
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
49
- const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
50
- out_[dst_data+i00] = OUT_TYPE(in_[src]);
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp DELETED
@@ -1,30 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
9
-
10
- layout(push_constant) uniform PushConstants {
11
- uint inOff;
12
- uint outOff;
13
- uint n_past;
14
- int ne00;
15
- int ne01;
16
- } pcs;
17
-
18
- void main() {
19
- const uint i02 = gl_WorkGroupID.z;
20
- const uint i01 = gl_WorkGroupID.y;
21
- const uint i00 = gl_WorkGroupID.x;
22
-
23
- const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
24
-
25
- if (i00 > pcs.n_past + i01) {
26
- out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
27
- } else {
28
- out_[index + pcs.outOff] = in_[index + pcs.inOff];
29
- }
30
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp DELETED
@@ -1,22 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
9
- layout(push_constant) uniform PushConstants {
10
- uint inOff;
11
- uint outOff;
12
- } pcs;
13
-
14
- void main() {
15
- const uint baseIndex = gl_WorkGroupID.x * 8;
16
-
17
- for (uint x = 0; x < 8; x++) {
18
- const uint i = baseIndex + x;
19
- const float y = in_[i + pcs.inOff];
20
- out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
21
- }
22
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp DELETED
@@ -1,17 +0,0 @@
1
- void main() {
2
- const uint i = gl_WorkGroupID.x;
3
- const int r = inB[i + pcs.inBOff];
4
-
5
- int z = 0;
6
- for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
7
- const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
8
- const mat4 result = dequantize_block(inIndex, ind%NL);
9
- for (uint j = 0; j < 4; ++j) {
10
- for (uint k = 0; k < 4; ++k) {
11
- const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
12
- out_[outIndex] = result[j][k];
13
- ++z;
14
- }
15
- }
16
- }
17
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp DELETED
@@ -1,31 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
8
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
9
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
10
-
11
- layout (push_constant) uniform parameter {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb01;
17
- int nb1;
18
- } pcs;
19
-
20
- void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
21
- for (int j = 0; j < k; j++) {
22
- out_[y + j] = inA[x + j];
23
- }
24
- }
25
-
26
- void main() {
27
- const uint i = gl_WorkGroupID.x;
28
- const int r = inB[i + pcs.inBOff];
29
-
30
- dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
31
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp DELETED
@@ -1,31 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout (binding = 0) readonly buffer tensorInA { float inA[]; };
8
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
9
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
10
-
11
- layout (push_constant) uniform parameter {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb01;
17
- int nb1;
18
- } pcs;
19
-
20
- void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
21
- for (int j = 0; j < k; j++) {
22
- out_[y + j] = inA[x + j];
23
- }
24
- }
25
-
26
- void main() {
27
- const uint i = gl_WorkGroupID.x;
28
- const int r = inB[i + pcs.inBOff];
29
-
30
- dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
31
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp DELETED
@@ -1,38 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define NL 2
6
- #define BYTES_FOR_TYPE 4 /*bytes for float*/
7
- #define SIZE_OF_BLOCK sizeof_block_q4_0
8
-
9
- layout(local_size_x = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int nb01;
21
- int nb1;
22
- } pcs;
23
-
24
- block_q4_0 get_unaligned_block_q4_0(uint index) {
25
- block_q4_0 fres;
26
- fres.d = u8BufToFloat16(inA, index);
27
- [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
28
- fres.qs[it] = inA[index+2+it];
29
- }
30
- return fres;
31
- }
32
-
33
- mat4 dequantize_block(uint index, uint il) {
34
- const block_q4_0 block = get_unaligned_block_q4_0(index);
35
- return dequantize_q4_0(block, il);
36
- }
37
-
38
- #include "op_getrows.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp DELETED
@@ -1,39 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define NL 2
6
- #define BYTES_FOR_TYPE 4 /*bytes for float*/
7
- #define SIZE_OF_BLOCK sizeof_block_q4_1
8
-
9
- layout(local_size_x = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int nb01;
21
- int nb1;
22
- } pcs;
23
-
24
- block_q4_1 get_unaligned_block_q4_1(uint index) {
25
- block_q4_1 fres;
26
- fres.d = u8BufToFloat16(inA, index);
27
- fres.m = u8BufToFloat16(inA, index+2);
28
- [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
29
- fres.qs[it] = inA[index+4+it];
30
- }
31
- return fres;
32
- }
33
-
34
- mat4 dequantize_block(uint index, uint il) {
35
- const block_q4_1 block = get_unaligned_block_q4_1(index);
36
- return dequantize_q4_1(block, il);
37
- }
38
-
39
- #include "op_getrows.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp DELETED
@@ -1,44 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define NL 16
6
- #define BYTES_FOR_TYPE 4 /*bytes for float*/
7
- #define SIZE_OF_BLOCK sizeof_block_q6_k
8
-
9
- layout(local_size_x = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { int inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int nb01;
21
- int nb1;
22
- } pcs;
23
-
24
- block_q6_k get_unaligned_block_q6_k(uint index) {
25
- block_q6_k fres;
26
- [[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
27
- fres.ql[it] = inA[index + it];
28
- }
29
- [[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
30
- fres.qh[it] = inA[index + QK_K/2 + it];
31
- }
32
- [[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
33
- fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
34
- }
35
- fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
36
- return fres;
37
- }
38
-
39
- mat4 dequantize_block(uint index, uint il) {
40
- const block_q6_k block = get_unaligned_block_q6_k(index);
41
- return dequantize_q6_k(block, il);
42
- }
43
-
44
- #include "op_getrows.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_mul.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1024) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
8
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
9
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
10
-
11
- layout(push_constant) uniform PushConstants {
12
- uint inAOff;
13
- uint inBOff;
14
- uint outOff;
15
- int ne00;
16
- int nb00;
17
- int nb01;
18
- int nb02;
19
- int nb03;
20
- int ne10;
21
- int ne11;
22
- int ne12;
23
- int ne13;
24
- int nb10;
25
- int nb11;
26
- int nb12;
27
- int nb13;
28
- int ne0;
29
- int nb0;
30
- int nb1;
31
- int nb2;
32
- int nb3;
33
- } pcs;
34
-
35
- void main() {
36
- const uint i03 = gl_WorkGroupID.z;
37
- const uint i02 = gl_WorkGroupID.y;
38
- const uint i01 = gl_WorkGroupID.x;
39
-
40
- const uint i13 = i03 % pcs.ne13;
41
- const uint i12 = i02 % pcs.ne12;
42
- const uint i11 = i01 % pcs.ne11;
43
-
44
- uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
45
- uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
46
- uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
47
-
48
- for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
49
- const uint i10 = i0 % pcs.ne10;
50
- out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp DELETED
@@ -1,69 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #extension GL_KHR_shader_subgroup_arithmetic : require
6
-
7
- layout(local_size_x_id = 0) in;
8
-
9
- layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
10
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
11
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
12
-
13
- layout (push_constant) uniform parameter {
14
- uint inAOff;
15
- uint inBOff;
16
- uint outOff;
17
- int ne00;
18
- int ne01;
19
- int ne02;
20
- uint nb00;
21
- uint nb01;
22
- uint nb02;
23
- uint nb03;
24
- int ne10;
25
- int ne11;
26
- int ne12;
27
- uint nb10;
28
- uint nb11;
29
- uint nb12;
30
- uint nb13;
31
- int ne0;
32
- int ne1;
33
- uint r2;
34
- uint r3;
35
- } pcs;
36
-
37
- #define N_F16_F32 4
38
-
39
- void main() {
40
- const uint r0 = gl_WorkGroupID.x;
41
- const uint rb = gl_WorkGroupID.y*N_F16_F32;
42
- const uint im = gl_WorkGroupID.z;
43
-
44
- const uint i12 = im%pcs.ne12;
45
- const uint i13 = im/pcs.ne12;
46
-
47
- const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
48
-
49
- const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
50
-
51
- for (uint row = 0; row < N_F16_F32; ++row) {
52
- uint r1 = rb + row;
53
- if (r1 >= pcs.ne11) {
54
- break;
55
- }
56
-
57
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
58
-
59
- float sumf = 0;
60
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
61
- sumf += float(inA[x+i]) * float(inB[y+i]);
62
- }
63
-
64
- const float all_sum = subgroupAdd(sumf);
65
- if (subgroupElect()) {
66
- out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
67
- }
68
- }
69
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp DELETED
@@ -1,51 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #extension GL_KHR_shader_subgroup_arithmetic : require
6
- #extension GL_EXT_debug_printf : enable
7
-
8
- // device subgroup size
9
- layout (local_size_x_id = 0) in;
10
-
11
- layout(binding = 0) readonly buffer tensorInA { float inA[]; };
12
- layout(binding = 1) readonly buffer tensorInB { float inB[]; };
13
- layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout(push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int ne01;
21
- int ne02;
22
- int ne11;
23
- int ne12;
24
- uint nb01;
25
- uint nb02;
26
- uint nb11;
27
- uint nb12;
28
- uint nb1;
29
- uint nb2;
30
- }
31
- pcs;
32
-
33
-
34
- void main() {
35
- uvec3 gid = gl_WorkGroupID;
36
-
37
- uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
38
- uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
39
-
40
- const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
41
- const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
42
- float sum = 0.0f;
43
- for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
44
- sum += float(inA[x+i]) * float(inB[y+i]);
45
- }
46
-
47
- const float all_sum = subgroupAdd(sum);
48
- if (subgroupElect()) {
49
- out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
50
- }
51
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp DELETED
@@ -1,33 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define BLOCKS_IN_QUANT QK4_0
6
- #define SIZE_OF_BLOCK sizeof_block_q4_0
7
- #define N_ROWS 4
8
-
9
- #include "op_mul_mv_q_n_pre.comp"
10
-
11
- // The q4_0 version of this function
12
- float block_q_n_dot_y(uint block_index, uint yb, uint il) {
13
- vec2 acc = vec2(0.0, 0.0);
14
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
15
- float d = float(u8BufToFloat16(inA, index));
16
- float sumy = 0.0f;
17
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
18
- const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
19
-
20
- const float yl0 = inB[yb + i];
21
- const float yl1 = inB[yb + i + 1];
22
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
23
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
24
-
25
- sumy += yl0 + yl1 + yl8 + yl9;
26
-
27
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
28
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
29
- }
30
- return d * (sumy * -8.f + acc[0] + acc[1]);
31
- }
32
-
33
- #include "op_mul_mv_q_n.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp DELETED
@@ -1,35 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define BLOCKS_IN_QUANT QK4_1
6
- #define SIZE_OF_BLOCK sizeof_block_q4_1
7
- #define N_ROWS 4
8
-
9
- #include "op_mul_mv_q_n_pre.comp"
10
-
11
- // The q4_1 version of this function
12
- float block_q_n_dot_y(uint block_index, uint yb, uint il) {
13
- vec2 acc = vec2(0.0, 0.0);
14
- const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
15
- float d = float(u8BufToFloat16(inA, index));
16
- float m = float(u8BufToFloat16(inA, index+2));
17
-
18
- float sumy = 0.0f;
19
- for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
20
- const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
21
-
22
- const float yl0 = inB[yb + i];
23
- const float yl1 = inB[yb + i + 1];
24
- const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
25
- const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
26
-
27
- sumy += yl0 + yl1 + yl8 + yl9;
28
-
29
- acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
30
- acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
31
- }
32
- return d * (acc[0] + acc[1]) + sumy * m;
33
- }
34
-
35
- #include "op_mul_mv_q_n.comp"
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp DELETED
@@ -1,140 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define N_DST 4
6
- #define SIZE_OF_BLOCK sizeof_block_q4_k
7
-
8
- layout(local_size_x = 4) in;
9
- layout(local_size_y = 8) in;
10
- layout(local_size_z = 1) in;
11
-
12
- layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
13
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
14
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
15
-
16
- layout (push_constant) uniform parameter {
17
- uint inAOff;
18
- uint inBOff;
19
- uint outOff;
20
- int ne00;
21
- int ne10;
22
- int ne0;
23
- int ne1;
24
- int ne01;
25
- int ne02;
26
- int ne12;
27
- uint nb01;
28
- uint nb02;
29
- uint nb03;
30
- uint nb11;
31
- uint nb12;
32
- uint nb13;
33
- uint r2;
34
- uint r3;
35
- } pcs;
36
-
37
- void main() {
38
- const uint16_t kmask1 = uint16_t(0x3f3f);
39
- const uint16_t kmask2 = uint16_t(0x0f0f);
40
- const uint16_t kmask3 = uint16_t(0xc0c0);
41
-
42
- const uint ix = gl_SubgroupInvocationID/8; // 0...3
43
- const uint it = gl_SubgroupInvocationID%8; // 0...7
44
- const uint iq = it/4; // 0 or 1
45
- const uint ir = it%4; // 0...3
46
-
47
- const uint nb = pcs.ne00/QK_K;
48
-
49
- const uint r0 = gl_WorkGroupID.x;
50
- const uint r1 = gl_WorkGroupID.y;
51
- const uint im = gl_WorkGroupID.z;
52
-
53
- const uint first_row = r0 * N_DST;
54
- const uint ib_row = first_row * nb;
55
-
56
- const uint i12 = im%pcs.ne12;
57
- const uint i13 = im/pcs.ne12;
58
-
59
- const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
60
- const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
61
-
62
- const uint xblk = offset0 + pcs.inAOff;
63
- const uint y = (offset1 / 4) + pcs.inBOff;
64
-
65
- float yl[16];
66
- float yh[16];
67
- float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
68
- float all_sum = 0.f;
69
-
70
- uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
71
-
72
- for (uint ib = ix; ib < nb; ib += 4) {
73
- const uint blk_idx = ib + xblk;
74
-
75
- float sumy[4] = {0.f, 0.f, 0.f, 0.f};
76
- for (int i = 0; i < 8; ++i) {
77
- yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
78
- yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
79
- yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
80
- yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
81
- }
82
-
83
- for (int row = 0; row < N_DST; row++) {
84
- uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
85
-
86
- uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
87
- uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
88
- uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
89
- uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
90
- uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
91
-
92
- uint16_t sc16[4];
93
- sc16[0] = sc_0 & kmask1;
94
- sc16[1] = sc_2 & kmask1;
95
- sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
96
- sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
97
-
98
- float acc1[4] = {0.f, 0.f, 0.f, 0.f};
99
- float acc2[4] = {0.f, 0.f, 0.f, 0.f};
100
- for (int i = 0; i < 8; i += 2) {
101
- uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
102
- uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
103
- acc1[0] += yl[i+0] * (q1 & 0x000F);
104
- acc1[1] += yl[i+1] * (q1 & 0x0F00);
105
- acc1[2] += yl[i+8] * (q1 & 0x00F0);
106
- acc1[3] += yl[i+9] * (q1 & 0xF000);
107
- acc2[0] += yh[i+0] * (q2 & 0x000F);
108
- acc2[1] += yh[i+1] * (q2 & 0x0F00);
109
- acc2[2] += yh[i+8] * (q2 & 0x00F0);
110
- acc2[3] += yh[i+9] * (q2 & 0xF000);
111
- }
112
-
113
- uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
114
- uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
115
- uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
116
- uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
117
- uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
118
- uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
119
- uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
120
- uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
121
-
122
- float dall = float(inA[blk_idx + row_idx].d);
123
- float dmin = float(inA[blk_idx + row_idx].dmin);
124
- sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
125
- (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
126
- (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
127
- (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
128
- dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
129
- }
130
-
131
- y4 += 4 * QK_K;
132
- }
133
-
134
- for (int row = 0; row < N_DST; ++row) {
135
- all_sum = subgroupAdd(sumf[row]);
136
- if (subgroupElect()) {
137
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
138
- }
139
- }
140
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp DELETED
@@ -1,106 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #define SIZE_OF_BLOCK sizeof_block_q6_k
6
-
7
- layout(local_size_x_id = 0) in;
8
- layout(local_size_y_id = 1) in;
9
- layout(local_size_z = 1) in;
10
-
11
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
12
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
13
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
14
-
15
- layout (push_constant) uniform parameter {
16
- uint inAOff;
17
- uint inBOff;
18
- uint outOff;
19
- int ne00;
20
- int ne10;
21
- int ne0;
22
- int ne1;
23
- int ne01;
24
- int ne02;
25
- int ne12;
26
- uint nb01;
27
- uint nb02;
28
- uint nb03;
29
- uint nb11;
30
- uint nb12;
31
- uint nb13;
32
- uint r2;
33
- uint r3;
34
- } pcs;
35
-
36
- void main() {
37
- const uint8_t kmask1 = uint8_t(0x03);
38
- const uint8_t kmask2 = uint8_t(0x0C);
39
- const uint8_t kmask3 = uint8_t(0x30);
40
- const uint8_t kmask4 = uint8_t(0xC0);
41
-
42
- const uint nb = pcs.ne00/QK_K;
43
-
44
- const uint r0 = gl_WorkGroupID.x;
45
- const uint r1 = gl_WorkGroupID.y;
46
- const uint im = gl_WorkGroupID.z;
47
-
48
- const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
49
-
50
- const uint i12 = im%pcs.ne12;
51
- const uint i13 = im/pcs.ne12;
52
-
53
- const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
54
- const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
55
-
56
- float sumf = 0;
57
-
58
- // bits of invocation ID for gl_SubgroupSize=32:
59
- // x x x x x
60
- // 4 3 2 1 0
61
- // ( tid ) ix
62
- // ip ( il )
63
-
64
- const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
65
- const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
66
- const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
67
- const uint ip = tid/8; // first or second half of block (0 or 1)
68
- const uint il = tid%8; // each half has 8 parts, one per scale
69
- const uint n = 4; // 4 scales at a time (and 4 sums)
70
- const uint l0 = n*il; // offset into half-block, 0..28
71
- const uint is = 8*ip + l0/16; // 0, 1, 8, 9
72
-
73
- const uint y_offset = 128*ip + l0;
74
- const uint q_offset_l = 64*ip + l0;
75
- const uint q_offset_h = 32*ip + l0;
76
-
77
- for (uint i = ix; i < nb; i += block_stride) {
78
-
79
- const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
80
-
81
- const uint qlIndex = q_offset_l;
82
- const uint q2Index = qlIndex + QK_K/8;
83
- const uint qhIndex = q_offset_h;
84
- const uint y = yy + i * QK_K + y_offset;
85
-
86
- float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
87
- for (uint l = 0; l < n; ++l) {
88
- const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
89
- const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
90
- const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
91
-
92
- sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
93
- sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
94
- sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
95
- sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
96
- }
97
-
98
- float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
99
- sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
100
- }
101
-
102
- const float tot = subgroupAdd(sumf);
103
- if (subgroupElect()) {
104
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
105
- }
106
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp DELETED
@@ -1,73 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- #include "op_mul_mv_q_n_pre.comp"
6
-
7
- #define SIZE_OF_D 2
8
-
9
- #define N_DST 4 // each SIMD group works on 4 rows
10
- #define N_SIMDGROUP 2 // number of SIMD groups in a thread group
11
- #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
12
-
13
- #define NB_Q8_0 8
14
-
15
- void main() {
16
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
17
- if (gl_SubgroupInvocationID > 31)
18
- return;
19
-
20
- const int nr = N_DST;
21
- const int nsg = N_SIMDGROUP;
22
- const int nw = N_SIMDWIDTH;
23
-
24
- const int nb = pcs.ne00/QK8_0;
25
- const uint r0 = gl_WorkGroupID.x;
26
- const uint r1 = gl_WorkGroupID.y;
27
- const uint im = gl_WorkGroupID.z;
28
-
29
- const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
30
-
31
- const uint i12 = im%pcs.ne12;
32
- const uint i13 = im/pcs.ne12;
33
-
34
- const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
35
-
36
- const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
37
- const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
38
-
39
- float yl[NB_Q8_0];
40
- float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
41
-
42
- const uint ix = gl_SubgroupInvocationID.x/4;
43
- const uint il = gl_SubgroupInvocationID.x%4;
44
-
45
- uint yb = y + ix * QK8_0 + NB_Q8_0*il;
46
-
47
- // each thread in a SIMD group deals with NB_Q8_0 quants at a time
48
- for (uint ib = ix; ib < nb; ib += nw/4) {
49
- for (int i = 0; i < NB_Q8_0; ++i) {
50
- yl[i] = inB[yb + i];
51
- }
52
-
53
- for (int row = 0; row < nr; row++) {
54
- const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
55
- float sumq = 0.f;
56
- for (int iq = 0; iq < NB_Q8_0; ++iq) {
57
- const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
58
- sumq += qs_iq * yl[iq];
59
- }
60
- const float16_t d = u8BufToFloat16(inA, x + block_offset);
61
- sumf[row] += sumq*d;
62
- }
63
-
64
- yb += NB_Q8_0 * nw;
65
- }
66
-
67
- for (int row = 0; row < nr; ++row) {
68
- const float tot = subgroupAdd(sumf[row]);
69
- if (subgroupElect() && first_row + row < pcs.ne01) {
70
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
71
- }
72
- }
73
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp DELETED
@@ -1,52 +0,0 @@
1
- void main() {
2
- // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
3
- if (gl_SubgroupInvocationID > 31)
4
- return;
5
-
6
- const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
7
-
8
- const uint r0 = gl_WorkGroupID.x;
9
- const uint r1 = gl_WorkGroupID.y;
10
- const uint im = gl_WorkGroupID.z;
11
-
12
- const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
13
-
14
- const uint i12 = im%pcs.ne12;
15
- const uint i13 = im/pcs.ne12;
16
-
17
- // pointers to src0 rows
18
- uint ax[N_ROWS];
19
- for (int row = 0; row < N_ROWS; ++row) {
20
- const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
21
-
22
- ax[row] = offset0 + pcs.inAOff;
23
- }
24
-
25
- const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
26
-
27
- float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
28
-
29
- const uint ix = gl_SubgroupInvocationID/2;
30
- const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
31
-
32
- uint yb = y + ix * BLOCKS_IN_QUANT + il;
33
-
34
- //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
35
- // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
36
- // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
37
-
38
- for (uint ib = ix; ib < nb; ib += 16) {
39
- for (int row = 0; row < N_ROWS; row++) {
40
- sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
41
- }
42
-
43
- yb += BLOCKS_IN_QUANT * 16;
44
- }
45
-
46
- for (int row = 0; row < N_ROWS; ++row) {
47
- const float tot = subgroupAdd(sumf[row]);
48
- if (first_row + row < pcs.ne01 && subgroupElect()) {
49
- out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
50
- }
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp DELETED
@@ -1,28 +0,0 @@
1
- layout(local_size_x_id = 0) in;
2
- layout(local_size_y = 8) in;
3
- layout(local_size_z = 1) in;
4
-
5
- layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
6
- layout (binding = 1) readonly buffer tensorInB { float inB[]; };
7
- layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
8
-
9
- layout (push_constant) uniform parameter {
10
- uint inAOff;
11
- uint inBOff;
12
- uint outOff;
13
- int ne00;
14
- int ne01;
15
- int ne02;
16
- int ne10;
17
- int ne12;
18
- int ne0;
19
- int ne1;
20
- uint nb01;
21
- uint nb02;
22
- uint nb03;
23
- uint nb11;
24
- uint nb12;
25
- uint nb13;
26
- uint r2;
27
- uint r3;
28
- } pcs;
 
ggml/src/ggml-kompute/kompute-shaders/op_norm.comp DELETED
@@ -1,84 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 256) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict tensorOut { float out_[]; };
9
-
10
- layout(push_constant) uniform PushConstants {
11
- uint inOff;
12
- uint outOff;
13
- uint ne00;
14
- uint nb01;
15
- float eps;
16
- } pcs;
17
-
18
- shared float sum[gl_WorkGroupSize.x];
19
-
20
- void main() {
21
- const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
22
- // MEAN
23
- // parallel sum
24
- sum[gl_LocalInvocationID.x] = 0.0;
25
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
26
- sum[gl_LocalInvocationID.x] += in_[x+i00];
27
- }
28
-
29
- // reduce
30
- barrier();
31
- memoryBarrierShared();
32
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
33
- if (gl_LocalInvocationID.x < i) {
34
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
35
- }
36
- barrier();
37
- memoryBarrierShared();
38
- }
39
-
40
- // broadcast
41
- if (gl_LocalInvocationID.x == 0) {
42
- sum[0] /= float(pcs.ne00);
43
- }
44
- barrier();
45
- memoryBarrierShared();
46
- const float mean = sum[0];
47
-
48
- // recenter
49
- const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
50
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
51
- out_[y+i00] = in_[x+i00] - mean;
52
- }
53
-
54
- // VARIANCE
55
- // parallel sum
56
- sum[gl_LocalInvocationID.x] = 0.0;
57
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
58
- sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
59
- }
60
-
61
- // reduce
62
- barrier();
63
- memoryBarrierShared();
64
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
65
- if (gl_LocalInvocationID.x < i) {
66
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
67
- }
68
- barrier();
69
- memoryBarrierShared();
70
- }
71
-
72
- // broadcast
73
- if (gl_LocalInvocationID.x == 0) {
74
- sum[0] /= float(pcs.ne00);
75
- }
76
- barrier();
77
- memoryBarrierShared();
78
- const float variance = sum[0];
79
-
80
- const float scale = 1.0f/sqrt(variance + pcs.eps);
81
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
82
- out_[y+i00] *= scale;
83
- }
84
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_relu.comp DELETED
@@ -1,21 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 1) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
9
- layout(push_constant) uniform PushConstants {
10
- uint inOff;
11
- uint outOff;
12
- } pcs;
13
-
14
- void main() {
15
- const uint baseIndex = gl_WorkGroupID.x * 4;
16
-
17
- for (uint x = 0; x < 4; x++) {
18
- const uint i = baseIndex + x;
19
- out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
20
- }
21
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp DELETED
@@ -1,53 +0,0 @@
1
- #version 450
2
-
3
- #include "common.comp"
4
-
5
- layout(local_size_x = 512) in;
6
-
7
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
8
- layout(binding = 1) buffer restrict tensorOut { float out_[]; };
9
-
10
- layout(push_constant) uniform PushConstants {
11
- uint inOff;
12
- uint outOff;
13
- uint ne00;
14
- uint nb01;
15
- float eps;
16
- } pcs;
17
-
18
- shared float sum[gl_WorkGroupSize.x];
19
-
20
- void main() {
21
- const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
22
-
23
- // parallel sum
24
- sum[gl_LocalInvocationID.x] = 0.0;
25
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
26
- sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
27
- }
28
-
29
- // reduce
30
- barrier();
31
- memoryBarrierShared();
32
- [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
33
- if (gl_LocalInvocationID.x < i) {
34
- sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
35
- }
36
- barrier();
37
- memoryBarrierShared();
38
- }
39
-
40
- // broadcast
41
- if (gl_LocalInvocationID.x == 0) {
42
- sum[0] /= float(pcs.ne00);
43
- }
44
- barrier();
45
- memoryBarrierShared();
46
-
47
- const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
48
-
49
- const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
50
- for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
51
- out_[y+i00] = in_[x+i00] * scale;
52
- }
53
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "rope_common.comp"
4
-
5
- layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
6
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
- layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
9
-
10
- void main() {
11
- const uint i3 = gl_WorkGroupID.z;
12
- const uint i2 = gl_WorkGroupID.y;
13
- const uint i1 = gl_WorkGroupID.x;
14
-
15
- float corr_dims[2];
16
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
-
18
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
-
20
- float theta_base = float(inB[pcs.inBOff + i2]);
21
- float inv_ndims = -1.f/pcs.n_dims;
22
-
23
- float cos_theta;
24
- float sin_theta;
25
-
26
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
- if (i0 < pcs.n_dims) {
28
- uint ic = i0/2;
29
-
30
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
31
-
32
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
-
34
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
-
36
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
37
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_
38
-
39
- const float x0 = float(inA[src]);
40
- const float x1 = float(inA[src+pcs.n_dims/2]);
41
-
42
- out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
43
- out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
44
- } else {
45
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
46
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
47
-
48
- out_[dst_data] = inA[src];
49
- out_[dst_data+1] = inA[src+1];
50
- }
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp DELETED
@@ -1,52 +0,0 @@
1
- #version 450
2
-
3
- #include "rope_common.comp"
4
-
5
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
6
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
- layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
9
-
10
- void main() {
11
- const uint i3 = gl_WorkGroupID.z;
12
- const uint i2 = gl_WorkGroupID.y;
13
- const uint i1 = gl_WorkGroupID.x;
14
-
15
- float corr_dims[2];
16
- rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
-
18
- const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
-
20
- float theta_base = float(inB[pcs.inBOff + i2]);
21
- float inv_ndims = -1.f/pcs.n_dims;
22
-
23
- float cos_theta;
24
- float sin_theta;
25
-
26
- for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
- if (i0 < pcs.n_dims) {
28
- uint ic = i0/2;
29
-
30
- float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
31
-
32
- const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
-
34
- rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
-
36
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
37
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_
38
-
39
- const float x0 = inA[src];
40
- const float x1 = inA[src+pcs.n_dims/2];
41
-
42
- out_[dst_data] = x0*cos_theta - x1*sin_theta;
43
- out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
44
- } else {
45
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
46
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
47
-
48
- out_[dst_data] = inA[src];
49
- out_[dst_data+1] = inA[src+1];
50
- }
51
- }
52
- }
 
ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp DELETED
@@ -1,52 +0,0 @@
- #version 450
-
- #include "rope_common.comp"
-
- layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
- layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
-
- void main() {
-     const uint i3 = gl_WorkGroupID.z;
-     const uint i2 = gl_WorkGroupID.y;
-     const uint i1 = gl_WorkGroupID.x;
-
-     float corr_dims[2];
-     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-     float theta_base = float(inB[pcs.inBOff + i2]);
-     float inv_ndims = -1.f/pcs.n_dims;
-
-     float cos_theta;
-     float sin_theta;
-
-     for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
-         if (i0 < pcs.n_dims) {
-             uint ic = i0/2;
-
-             float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
-             const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
-             rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
-             const float x0 = float(inA[src]);
-             const float x1 = float(inA[src+1]);
-
-             out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
-             out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
-         } else {
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
-             out_[dst_data] = inA[src];
-             out_[dst_data+1] = inA[src+1];
-         }
-     }
- }
ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp DELETED
@@ -1,52 +0,0 @@
- #version 450
-
- #include "rope_common.comp"
-
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
- layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
- layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
- layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
-
- void main() {
-     const uint i3 = gl_WorkGroupID.z;
-     const uint i2 = gl_WorkGroupID.y;
-     const uint i1 = gl_WorkGroupID.x;
-
-     float corr_dims[2];
-     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
-
-     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
-
-     float theta_base = float(inB[pcs.inBOff + i2]);
-     float inv_ndims = -1.f/pcs.n_dims;
-
-     float cos_theta;
-     float sin_theta;
-
-     for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
-         if (i0 < pcs.n_dims) {
-             uint ic = i0/2;
-
-             float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
-
-             const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
-
-             rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-             const float x0 = inA[src];
-             const float x1 = inA[src+1];
-
-             out_[dst_data] = x0*cos_theta - x1*sin_theta;
-             out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
-         } else {
-             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-             out_[dst_data] = inA[src];
-             out_[dst_data+1] = inA[src+1];
-         }
-     }
- }
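The four deleted rope shaders above differ only in element type (f16 vs f32, which is why the byte offsets are divided by 2 or 4) and in how elements are paired for rotation: the `norm` variants rotate adjacent elements `(src, src+1)`, while the `neox` variants rotate elements `n_dims/2` apart; dimensions at or beyond `n_dims` are copied through unchanged in the `else` branch. A minimal scalar C sketch of the two pairings (illustrative names only, not part of the ggml API):

```c
// Sketch of the two RoPE pairing schemes used by the deleted shaders above.
#include <math.h>
#include <stdio.h>

// "norm" variant: rotate adjacent elements (x[0], x[1]).
static void rope_norm_pair(const float *x, float *y, float cos_theta, float sin_theta) {
    y[0] = x[0]*cos_theta - x[1]*sin_theta;
    y[1] = x[0]*sin_theta + x[1]*cos_theta;
}

// "neox" variant: rotate elements half the rotated width apart (x[0], x[half]),
// where half = n_dims/2.
static void rope_neox_pair(const float *x, float *y, int half, float cos_theta, float sin_theta) {
    y[0]    = x[0]*cos_theta - x[half]*sin_theta;
    y[half] = x[0]*sin_theta + x[half]*cos_theta;
}

int main(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {0};
    float c = cosf(0.5f), s = sinf(0.5f);

    rope_norm_pair(x, y, c, s);     // pairs (x[0], x[1])
    printf("norm: %f %f\n", y[0], y[1]);

    rope_neox_pair(x, y, 2, c, s);  // pairs (x[0], x[2]) with n_dims = 4
    printf("neox: %f %f\n", y[0], y[2]);
    return 0;
}
```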
ggml/src/ggml-kompute/kompute-shaders/op_scale.comp DELETED
@@ -1,19 +0,0 @@
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x = 1) in;
-
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
- layout(push_constant) uniform PushConstants {
-     uint inOff;
-     uint outOff;
-     float scale;
- } pcs;
-
- void main() {
-     const uint i = gl_WorkGroupID.x;
-     out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
- }
ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp DELETED
@@ -1,23 +0,0 @@
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x = 1) in;
-
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
- layout(push_constant) uniform PushConstants {
-     uint inOff;
-     uint outOff;
-     float scale;
- } pcs;
-
- void main() {
-     const uint baseIndex = gl_WorkGroupID.x * 8;
-
-     for (uint x = 0; x < 8; x++) {
-         const uint i = baseIndex + x;
-         out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
-     }
- }
ggml/src/ggml-kompute/kompute-shaders/op_silu.comp DELETED
@@ -1,22 +0,0 @@
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x = 1) in;
-
- layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
- layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
- layout(push_constant) uniform PushConstants {
-     uint inOff;
-     uint outOff;
- } pcs;
-
- void main() {
-     const uint baseIndex = gl_WorkGroupID.x * 4;
-
-     for (uint x = 0; x < 4; x++) {
-         const uint i = baseIndex + x;
-         const float y = in_[i + pcs.inOff];
-         out_[i + pcs.outOff] = y / (1.0 + exp(-y));
-     }
- }
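For reference, the loop body above is the SiLU activation, `silu(x) = x * sigmoid(x) = x / (1 + exp(-x))`, applied to four consecutive elements per workgroup. A scalar C sketch of the same function (illustrative only, not the ggml API):

```c
// SiLU as computed by the deleted op_silu.comp loop body.
#include <math.h>
#include <stdio.h>

static float silu(float x) {
    return x / (1.0f + expf(-x));
}

int main(void) {
    for (float x = -2.0f; x <= 2.0f; x += 1.0f) {
        printf("silu(%.1f) = %f\n", x, silu(x));
    }
    return 0;
}
```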
ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp DELETED
@@ -1,72 +0,0 @@
- // TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
-
- #version 450
-
- #include "common.comp"
-
- layout(local_size_x_id = 0) in;
-
- layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
- layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
- layout(push_constant) uniform PushConstants {
-     uint inAOff;
-     uint inBOff;
-     uint outOff;
-     int ne00;
-     int ne01;
-     int ne02;
-     float scale;
-     float max_bias;
-     float m0;
-     float m1;
-     uint n_head_log2;
-     int mask;
- } pcs;
-
- void main() {
-     if (gl_SubgroupInvocationID > 31)
-         return;
-
-     const uint i03 = gl_WorkGroupID.z;
-     const uint i02 = gl_WorkGroupID.y;
-     const uint i01 = gl_WorkGroupID.x;
-
-     const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
-     const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
-     const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
-     const uint pdst = extra_off + pcs.outOff; // Based from out_
-
-     float slope = 1.0f;
-
-     // ALiBi
-     if (pcs.max_bias > 0.0f) {
-         int64_t h = i02;
-
-         float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
-         int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
-
-         slope = pow(base, float(exp));
-     }
-
-     // parallel max
-     float localMax = uintBitsToFloat(0xFF800000);
-     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-         localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
-     }
-     float max_ = subgroupMax(localMax);
-
-     // parallel sum
-     float localSum = 0.0f;
-     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-         const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
-         localSum += exp_psrc0;
-         out_[pdst + i00] = exp_psrc0;
-     }
-
-     const float sum = subgroupAdd(localSum);
-     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-         out_[pdst + i00] /= sum;
-     }
- }
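The shader above computes a numerically stable softmax over each row: it optionally adds an ALiBi bias (`slope * mask`), takes the row maximum, exponentiates the shifted values, and normalizes by their sum; the three loops are strided across a 32-lane subgroup and reduced with `subgroupMax`/`subgroupAdd`. The slope itself is `pow(base, exp)` with `base` and `exp` picked from `m0`, `m1`, and `n_head_log2` as in the `// ALiBi` block. A scalar C sketch of the same per-row math (illustrative names, not the ggml API):

```c
// Per-row softmax with optional ALiBi-style mask bias, mirroring the math of
// the deleted op_softmax.comp (without the subgroup parallelism).
#include <math.h>
#include <stddef.h>

void softmax_row(const float *src, const float *mask, float *dst, size_t n,
                 float scale, float slope) {
    // 1. row maximum of the scaled, biased logits (for numerical stability)
    float max_ = -INFINITY;
    for (size_t i = 0; i < n; i++) {
        const float v = src[i]*scale + (mask ? slope*mask[i] : 0.0f);
        if (v > max_) max_ = v;
    }
    // 2. exponentiate shifted values and accumulate their sum
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        const float e = expf(src[i]*scale + (mask ? slope*mask[i] : 0.0f) - max_);
        dst[i] = e;
        sum += e;
    }
    // 3. normalize
    for (size_t i = 0; i < n; i++) {
        dst[i] /= sum;
    }
}
```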
ggml/src/ggml-kompute/kompute-shaders/rope_common.comp DELETED
@@ -1,71 +0,0 @@
- #include "common.comp"
-
- #define GGML_ROPE_TYPE_NEOX 2
-
- // TODO: use a local size of 32 or more (Metal uses 1024)
- layout(local_size_x = 1) in;
-
- layout (push_constant) uniform parameter {
-     uint inAOff;
-     uint inBOff;
-     uint inCOff;
-     uint outOff;
-     int n_dims;
-     int mode;
-     int n_ctx_orig;
-     float freq_base;
-     float freq_scale;
-     bool has_freq_factors;
-     float ext_factor;
-     float attn_factor;
-     float beta_fast;
-     float beta_slow;
-     uint nb00;
-     uint nb01;
-     uint nb02;
-     uint nb03;
-     int ne0;
-     uint nb0;
-     uint nb1;
-     uint nb2;
-     uint nb3;
- } pcs;
-
- float rope_yarn_ramp(const float low, const float high, const float i0) {
-     const float y = (i0 / 2 - low) / max(0.001f, high - low);
-     return 1.0f - min(1.0f, max(0.0f, y));
- }
-
- // YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
- // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
- void rope_yarn(
-     float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
-     out float cos_theta, out float sin_theta
- ) {
-     // Get n-d rotational scaling corrected for extrapolation
-     float theta_interp = freq_scale * theta_extrap;
-     float theta = theta_interp;
-     if (ext_factor != 0.0f) {
-         float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-         theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-         // Get n-d magnitude scaling corrected for interpolation
-         mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
-     }
-     cos_theta = cos(theta) * mscale;
-     sin_theta = sin(theta) * mscale;
- }
-
- // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
- // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
- float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
-     return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base));
- }
-
- void rope_yarn_corr_dims(
-     int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2]
- ) {
-     // start and end correction dims
-     dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base)));
-     dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)));
- }
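For reference, the correction factor in `rope_yarn_corr_factor` follows from asking at which dimension index the rotary embedding completes a given number of full rotations over the original context. A short derivation consistent with the code above, writing base b = `freq_base`, d = `n_dims`, L = `n_ctx_orig`:

```latex
% Dimension i rotates with frequency b^{-2i/d}, so over L positions it completes
% r(i) = L * b^{-2i/d} / (2*pi) full rotations. Solving r(i) = n_rot for i:
\[
  i \;=\; \frac{d}{2\ln b}\,\ln\!\left(\frac{L}{2\pi\, n_{\mathrm{rot}}}\right)
  \;=\; \mathtt{rope\_yarn\_corr\_factor}(d,\, L,\, n_{\mathrm{rot}},\, b)
\]
```

Evaluating this at `beta_fast` and `beta_slow` and clamping to `[0, n_dims - 1]` gives the ramp endpoints returned by `rope_yarn_corr_dims`, which `rope_yarn` then uses to blend the interpolated and extrapolated angles whenever `ext_factor != 0`.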