ggerganov committed
Commit d303fe3 · Parent: f580c99

ggml : remove OpenCL (#0)

CMakeLists.txt CHANGED
@@ -463,21 +463,6 @@ if (WHISPER_HIPBLAS)
     endif()
 endif()
 
-if (WHISPER_CLBLAST)
-    find_package(CLBlast)
-    if (CLBlast_FOUND)
-        message(STATUS "CLBlast found")
-
-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
-
-        add_compile_definitions(GGML_USE_CLBLAST)
-
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
-    else()
-        message(FATAL_ERROR "CLBlast not found")
-    endif()
-endif()
-
 if( WHISPER_OPENVINO )
     find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 endif()
@@ -724,9 +709,8 @@ add_library(${TARGET}
     ggml-quants.c
     ${GGML_SOURCES_METAL}
     ${GGML_SOURCES_CUDA}
-    ${GGML_SOURCES_OPENCL}
-    ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
-    ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
+    ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
+    ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
     whisper.h
     whisper.cpp
     )
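For downstream users who configured with the removed option, the configure step changes as sketched below. The old command is taken from the README section removed later in this commit; the replacement is just a default build, so treat this as a hedged example rather than an official migration guide.

```bash
# Before this commit (from the old README): build with CLBlast/OpenCL support
cmake -B build -DWHISPER_CLBLAST=ON
cmake --build build -j --config Release

# After this commit the option no longer exists; -DWHISPER_CLBLAST=ON is ignored
# (newer CMake versions report it as a manually-specified variable that was not used).
cmake -B build
cmake --build build -j --config Release
```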
Makefile CHANGED
@@ -333,21 +333,6 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif
 
-ifdef WHISPER_CLBLAST
-	CFLAGS   += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
-	LDFLAGS  += -lclblast
-	ifeq ($(UNAME_S),Darwin)
-		LDFLAGS += -framework OpenCL
-	else
-		LDFLAGS += -lOpenCL
-	endif
-	WHISPER_OBJ += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
-
 ifdef WHISPER_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
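Because the removed block added `-lclblast` and `-lOpenCL` (or `-framework OpenCL` on Darwin) to `LDFLAGS`, a binary built before this commit with `WHISPER_CLBLAST=1` carries those libraries as dynamic dependencies. A hedged way to check and rebuild on Linux; `./main` here is only a placeholder for whichever example binary you built:

```bash
# Does an existing binary still depend on CLBlast/OpenCL?
ldd ./main | grep -Ei 'clblast|opencl' || echo "no CLBlast/OpenCL dependency"

# Rebuild with the current Makefile; WHISPER_CLBLAST=1 is now a no-op
make clean
make -j
```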
README.md CHANGED
@@ -20,7 +20,6 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - Zero memory allocations at runtime
 - Support for CPU-only inference
 - [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
-- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
 - [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
 
@@ -422,28 +421,6 @@ make clean
 WHISPER_CUDA=1 make -j
 ```
 
-## OpenCL GPU support via CLBlast
-
-For cards and integrated GPUs that support OpenCL, the Encoder processing can be largely offloaded to the GPU through CLBlast. This is especially useful for users with AMD APUs or low end devices for up to ~2x speedup.
-
-First, make sure you have installed `CLBlast` for your OS or Distribution: https://github.com/CNugteren/CLBlast
-
-Now build `whisper.cpp` with CLBlast support:
-
-```
-Makefile:
-cd whisper.cpp
-make clean
-WHISPER_CLBLAST=1 make -j
-
-CMake:
-cd whisper.cpp
-cmake -B build -DWHISPER_CLBLAST=ON
-cmake --build build -j --config Release
-```
-
-Run all the examples as usual.
-
 ## BLAS CPU support via OpenBLAS
 
 Encoder processing can be accelerated on the CPU via OpenBLAS.
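For anyone who was following the removed CLBlast instructions, the build paths that remain documented in this README are the default CPU build, OpenBLAS, and CUDA. A hedged migration sketch using only commands that already appear in the README:

```bash
cd whisper.cpp
make clean

# old (removed by this commit):
#   WHISPER_CLBLAST=1 make -j

# plain CPU build
make -j

# or, on NVIDIA GPUs, the CUDA build described earlier in the README
WHISPER_CUDA=1 make -j
```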
examples/whisper.android/README.md CHANGED
@@ -12,47 +12,3 @@ To use:
 (PS: Do not move this android project folder individually to other folders, because this android project folder depends on the files of the whole project.)
 
 <img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
-
-## CLBlast
-
-> [!NOTE]
-> - OpenCL does not have the same level of support as CUDA or Metal.
-> - Turning on CLBlast may degrade OpenCL performance if your device isn't already tuned. See [tuning.md](https://github.com/CNugteren/CLBlast/blob/162783a414969464ce3aa5adf5c2554afa5ee93e/doc/tuning.md#already-tuned-for-devices) for a list of devices that are already tuned and what to do if yours is missing.
-
-Build CLBlast.
-
-```
-# In path/to/CLBlast (we assume OpenCL-Headers relative location)
-$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
-    -DCMAKE_SYSTEM_NAME=Android \
-    -DCMAKE_SYSTEM_VERSION=33 \
-    -DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-    -DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
-    -DCMAKE_ANDROID_STL_TYPE=c++_static \
-    -DOPENCL_ROOT=$(readlink -f ../../OpenCL-Headers) \
-    -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
-    -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-# Build libclblast.so
-make -j4
-```
-
-Pull `libGLES_mali.so` to `libOpenCL.so`.
-
-```bash
-# In path/to/whisper.android
-mkdir lib/src/main/jniLibs/arm64-v8a
-adb pull /system/vendor/lib64/egl/libGLES_mali.so lib/src/main/jniLibs/arm64-v8a/libOpenCL.so
-```
-
-In gradle.properties, set `GGML_HOME` to the location of GGML, as well as
-required options for turning on CLBlast.
-
-```
-GGML_HOME=/path/to/ggml
-GGML_CLBLAST=ON
-CLBLAST_HOME=/path/to/CLBlast
-OPENCL_LIB=/path/to/libOpenCL.so
-OPENCL_ROOT=/path/to/OpenCL-Headers
-```
-
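If the removed CLBlast steps were followed previously, the pulled `libOpenCL.so` and the CLBlast entries in `gradle.properties` are now unused. A small, hedged cleanup sketch that assumes the paths from the removed instructions; adjust if your checkout differs:

```bash
# In path/to/whisper.android
rm -f lib/src/main/jniLibs/arm64-v8a/libOpenCL.so

# Drop the CLBlast-related keys; keep GGML_HOME if you still point the project at a local ggml
sed -i.bak -E '/^(GGML_CLBLAST|CLBLAST_HOME|OPENCL_LIB|OPENCL_ROOT)=/d' gradle.properties
```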
ggml-opencl.cpp DELETED
@@ -1,2305 +0,0 @@
1
- #include "ggml.h"
2
- #include "ggml-opencl.h"
3
- #include "ggml-backend-impl.h"
4
-
5
- #include <array>
6
- #include <atomic>
7
- #include <cstdio>
8
- #include <cstdlib>
9
- #include <cstring>
10
- #include <limits>
11
- #include <sstream>
12
- #include <vector>
13
-
14
- #define CL_TARGET_OPENCL_VERSION 120
15
- #include <clblast.h>
16
-
17
- #if defined(_MSC_VER)
18
- #pragma warning(disable: 4244 4267) // possible loss of data
19
- #endif
20
-
21
- #define CL_DMMV_LOCAL_SIZE 32
22
-
23
- #ifndef K_QUANTS_PER_ITERATION
24
- #define K_QUANTS_PER_ITERATION 1
25
- #else
26
- static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
27
- #endif
28
-
29
- #define MULTILINE_QUOTE(...) #__VA_ARGS__
30
- static std::string program_source = MULTILINE_QUOTE(
31
-
32
- typedef char int8_t;
33
- typedef uchar uint8_t;
34
- typedef short int16_t;
35
- typedef ushort uint16_t;
36
- typedef int int32_t;
37
- typedef uint uint32_t;
38
-
39
- struct __attribute__ ((packed)) block_q4_0
40
- {
41
- half d;
42
- uint8_t qs[QK4_0 / 2];
43
- };
44
-
45
- struct __attribute__ ((packed)) block_q4_1
46
- {
47
- half d;
48
- half m;
49
- uint8_t qs[QK4_1 / 2];
50
- };
51
-
52
- struct __attribute__ ((packed)) block_q5_0
53
- {
54
- half d;
55
- uint32_t qh;
56
- uint8_t qs[QK5_0 / 2];
57
- };
58
-
59
- struct __attribute__ ((packed)) block_q5_1
60
- {
61
- half d;
62
- half m;
63
- uint32_t qh;
64
- uint8_t qs[QK5_1 / 2];
65
- };
66
-
67
- struct __attribute__ ((packed)) block_q8_0
68
- {
69
- half d;
70
- int8_t qs[QK8_0];
71
- };
72
-
73
- struct __attribute__((packed)) block_q2_K
74
- {
75
- uint8_t scales[16];
76
- uint8_t qs[64];
77
- half d;
78
- half dmin;
79
- };
80
-
81
- struct __attribute__((packed)) block_q3_K
82
- {
83
- uint8_t hmask[32];
84
- uint8_t qs[64];
85
- uint8_t scales[12];
86
- half d;
87
- };
88
-
89
- struct __attribute__((packed)) block_q4_K
90
- {
91
- half d;
92
- half dmin;
93
- uint8_t scales[12];
94
- uint8_t qs[128];
95
- };
96
-
97
- struct __attribute__((packed)) block_q5_K
98
- {
99
- half d;
100
- half dmin;
101
- uint8_t scales[12];
102
- uint8_t qh[32];
103
- uint8_t qs[128];
104
- };
105
-
106
- struct __attribute__((packed)) block_q6_K
107
- {
108
- uint8_t ql[128];
109
- uint8_t qh[64];
110
- int8_t scales[16];
111
- half d;
112
- };
113
-
114
- __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) {
115
- const uint i = get_global_id(0);
116
-
117
- y[i] = vload_half(0, &x[i]);
118
- }
119
-
120
- void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) {
121
- const float d = vload_half(0, &x[ib].d);
122
-
123
- const uint8_t vui = x[ib].qs[iqs];
124
-
125
- const int8_t vi0 = vui & 0xF;
126
- const int8_t vi1 = vui >> 4;
127
-
128
- *v0 = (vi0 - 8)*d;
129
- *v1 = (vi1 - 8)*d;
130
- }
131
- void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
132
- const float d = vload_half(0, &x[ib].d);
133
- const float m = vload_half(0, &x[ib].m);
134
-
135
- const uint8_t vui = x[ib].qs[iqs];
136
-
137
- const int8_t vi0 = vui & 0xF;
138
- const int8_t vi1 = vui >> 4;
139
-
140
- *v0 = vi0*d + m;
141
- *v1 = vi1*d + m;
142
- }
143
- void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
144
- const float d = vload_half(0, &x[ib].d);
145
-
146
- uint32_t qh = x[ib].qh;
147
-
148
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
149
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
150
-
151
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
152
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
153
-
154
- *v0 = x0*d;
155
- *v1 = x1*d;
156
- }
157
- void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
158
- const float d = vload_half(0, &x[ib].d);
159
- const float m = vload_half(0, &x[ib].m);
160
-
161
- uint32_t qh = x[ib].qh;
162
-
163
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
164
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
165
-
166
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
167
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
168
-
169
- *v0 = x0*d + m;
170
- *v1 = x1*d + m;
171
- }
172
- void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
173
- const float d = vload_half(0, &x[ib].d);
174
-
175
- const int8_t vi0 = x[ib].qs[iqs + 0];
176
- const int8_t vi1 = x[ib].qs[iqs + 1];
177
-
178
- *v0 = vi0*d;
179
- *v1 = vi1*d;
180
- }
181
- void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
182
- *v0 = vload_half(0, &x[ib + 0]);
183
- *v1 = vload_half(0, &x[ib + 1]);
184
- }
185
- );
186
-
187
- static std::string k_quants_source = MULTILINE_QUOTE(
188
- inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
189
- {
190
- if (j < 4)
191
- {
192
- *d = q[j] & 63;
193
- *m = q[j + 4] & 63;
194
- }
195
- else
196
- {
197
- *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
198
- *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
199
- }
200
- }
201
-
202
- __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
203
- {
204
- const int i = get_group_id(0) + get_global_offset(0);
205
- const int tid = get_local_id(0);
206
- const int n = tid / 32;
207
- const int l = tid - 32 * n;
208
- const int is = 8 * n + l / 16;
209
-
210
- const uint8_t q = x[i].qs[32 * n + l];
211
- __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
212
-
213
- const float dall = vload_half(0, &x[i].d);
214
- const float dmin = vload_half(0, &x[i].dmin);
215
-
216
- y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4);
217
- y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4);
218
- y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4);
219
- y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4);
220
- }
221
-
222
- __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
223
- {
224
- int r = get_local_id(0) / 4;
225
- int i = get_group_id(0) + get_global_offset(0);
226
- int tid = r / 2;
227
- int is0 = r % 2;
228
- int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
229
- int n = tid / 4;
230
- int j = tid - 4 * n;
231
-
232
- uint8_t m = 1 << (4 * n + j);
233
- int is = 8 * n + 2 * j + is0;
234
- int shift = 2 * j;
235
-
236
- int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4)
237
- : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4)
238
- : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4)
239
- : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4);
240
- float d_all = vload_half(0, &x[i].d);
241
- float dl = d_all * (us - 32);
242
-
243
- __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
244
- const __global uint8_t *q = x[i].qs + 32 * n;
245
- const __global uint8_t *hm = x[i].hmask;
246
-
247
- for (int l = l0; l < l0 + 4; ++l)
248
- y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
249
- }
250
-
251
- __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
252
- {
253
- const int i = get_group_id(0) + get_global_offset(0);
254
- const int tid = get_local_id(0);
255
- const int il = tid / 8;
256
- const int ir = tid % 8;
257
- const int is = 2 * il;
258
- const int n = 4;
259
-
260
- __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
261
-
262
- const float dall = vload_half(0, &x[i].d);
263
- const float dmin = vload_half(0, &x[i].dmin);
264
-
265
- __global const uint8_t *q = x[i].qs + 32 * il + n * ir;
266
-
267
- uint8_t sc, m;
268
- get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
269
- float d1 = dall * sc;
270
- float m1 = dmin * m;
271
- get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
272
- float d2 = dall * sc;
273
- float m2 = dmin * m;
274
- for (int l = 0; l < n; ++l)
275
- {
276
- y[l + 0] = d1 * (q[l] & 0xF) - m1;
277
- y[l + 32] = d2 * (q[l] >> 4) - m2;
278
- }
279
- }
280
-
281
- __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
282
- {
283
- const int i = get_group_id(0) + get_global_offset(0);
284
- const int tid = get_local_id(0);
285
- const int il = tid / 16;
286
- const int ir = tid % 16;
287
- const int is = 2 * il;
288
-
289
- __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
290
-
291
- const float dall = vload_half(0, &x[i].d);
292
- const float dmin = vload_half(0, &x[i].dmin);
293
-
294
- __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir;
295
- __global const uint8_t *qh = x[i].qh + 2 * ir;
296
-
297
- uint8_t sc, m;
298
- get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
299
- const float d1 = dall * sc;
300
- const float m1 = dmin * m;
301
- get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
302
- const float d2 = dall * sc;
303
- const float m2 = dmin * m;
304
-
305
- uint8_t hm = 1 << (2 * il);
306
- y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1;
307
- y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1;
308
- hm <<= 1;
309
- y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2;
310
- y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2;
311
- }
312
-
313
- __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
314
- {
315
- const int i = get_group_id(0) + get_global_offset(0);
316
- const int tid = get_local_id(0);
317
- const int ip = tid / 32;
318
- const int il = tid - 32 * ip;
319
- const int is = 8 * ip + il / 16;
320
-
321
- __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
322
-
323
- const float d = vload_half(0, &x[i].d);
324
-
325
- __global const uint8_t *ql = x[i].ql + 64 * ip + il;
326
- const uint8_t qh = x[i].qh[32 * ip + il];
327
- __global const int8_t *sc = x[i].scales + is;
328
-
329
- y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
330
- y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
331
- y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
332
- y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
333
- }
334
-
335
- __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
336
-
337
- const int row = get_group_id(0);
338
-
339
- const int num_blocks_per_row = ncols / QK_K;
340
- const int ib0 = row*num_blocks_per_row + get_global_offset(0);
341
-
342
- __global const struct block_q2_K * x = xx + ib0;
343
-
344
- const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
345
- const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
346
-
347
- const int step = 16/K_QUANTS_PER_ITERATION;
348
-
349
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
350
- const int in = tid - step*im; // 0...15 or 0...7
351
-
352
- const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
353
- const int q_offset = 32*im + l0;
354
- const int s_offset = 8*im;
355
- const int y_offset = 128*im + l0;
356
-
357
- tmp[16 * ix + tid] = 0;
358
-
359
- uint32_t aux[4];
360
- const uint8_t * d = (const uint8_t *)aux;
361
- const uint8_t * m = (const uint8_t *)(aux + 2);
362
-
363
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
364
-
365
- __global const float * y = yy + i * QK_K + y_offset;
366
- __global const uint8_t * q = x[i].qs + q_offset;
367
-
368
- const float dall = vload_half(0, &x[i].d);
369
- const float dmin = vload_half(0, &x[i].dmin);
370
-
371
- __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
372
- aux[0] = a[0] & 0x0f0f0f0f;
373
- aux[1] = a[1] & 0x0f0f0f0f;
374
- aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
375
- aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
376
-
377
- float sum1 = 0, sum2 = 0;
378
- for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
379
- sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
380
- + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
381
- + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
382
- + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
383
- + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
384
- + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
385
- + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
386
- +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
387
- sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
388
- + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
389
-
390
- }
391
- tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
392
-
393
- }
394
-
395
- // sum up partial sums and write back result
396
- barrier(CLK_LOCAL_MEM_FENCE);
397
- for (int s=16; s>0; s>>=1) {
398
- if (tid < s) {
399
- tmp[tid] += tmp[tid + s];
400
- }
401
- barrier(CLK_LOCAL_MEM_FENCE);
402
- }
403
- if (tid == 0) {
404
- dst[row] = tmp[0];
405
- }
406
- }
407
-
408
- __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
409
- const uint16_t kmask1 = 0x0303;
410
- const uint16_t kmask2 = 0x0f0f;
411
-
412
- const int row = get_group_id(0);
413
-
414
- const int num_blocks_per_row = ncols / QK_K;
415
- const int ib0 = row*num_blocks_per_row + get_global_offset(0);
416
-
417
- __global const struct block_q3_K * x = xx + ib0;
418
-
419
- const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
420
- const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
421
-
422
- const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
423
- const int step = 16/K_QUANTS_PER_ITERATION;
424
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
425
- const int in = tid - step*im; // 0....15 or 0...7
426
-
427
- const uint8_t m = 1 << (4*im);
428
-
429
- const int l0 = n*in; // 0...15 or 0...14 in steps of 2
430
- const int q_offset = 32*im + l0;
431
- const int y_offset = 128*im + l0;
432
-
433
- uint16_t utmp[4];
434
- const int8_t * s = (const int8_t *)utmp;
435
-
436
- const uint16_t s_shift = 4*im;
437
-
438
- tmp[16 * ix + tid] = 0;
439
-
440
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
441
-
442
- __global const float * y = yy + i * QK_K + y_offset;
443
- __global const uint8_t * q = x[i].qs + q_offset;
444
- __global const uint8_t * h = x[i].hmask + l0;
445
-
446
- __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
447
- utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
448
- utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
449
- utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
450
- utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
451
-
452
- const float d = vload_half(0, &x[i].d);
453
-
454
- float sum = 0;
455
- for (int l = 0; l < n; ++l) {
456
- sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
457
- + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
458
- + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
459
- + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
460
- sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
461
- + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
462
- + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
463
- + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
464
- }
465
- tmp[16 * ix + tid] += d * sum;
466
-
467
- }
468
-
469
- // sum up partial sums and write back result
470
- barrier(CLK_LOCAL_MEM_FENCE);
471
- for (int s=16; s>0; s>>=1) {
472
- if (tid < s) {
473
- tmp[tid] += tmp[tid + s];
474
- }
475
- barrier(CLK_LOCAL_MEM_FENCE);
476
- }
477
- if (tid == 0) {
478
- dst[row] = tmp[0];
479
- }
480
- }
481
-
482
- __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
483
-
484
- //to rename it later, just to test now
485
- const uint16_t kmask1 = 0x3f3f;
486
- const uint16_t kmask2 = 0x0f0f;
487
- const uint16_t kmask3 = 0xc0c0;
488
-
489
- const int row = get_group_id(0);
490
- const int num_blocks_per_row = ncols / QK_K;
491
- const int ib0 = row*num_blocks_per_row + get_global_offset(0);
492
-
493
- const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
494
- const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
495
-
496
- const int step = 8/K_QUANTS_PER_ITERATION;
497
-
498
- const int il = tid/step; // 0...3
499
- const int ir = tid - step*il;// 0...3
500
- const int n = 2*K_QUANTS_PER_ITERATION;
501
-
502
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
503
- const int in = il%2;
504
-
505
- const int l0 = n*(2*ir + in);
506
- const int q_offset = 32*im + l0;
507
- const int y_offset = 64*im + l0;
508
-
509
- uint16_t aux[4];
510
- const uint8_t * sc = (const uint8_t *)aux;
511
-
512
- __global const struct block_q4_K * x = xx + ib0;
513
-
514
- tmp[16 * ix + tid] = 0;
515
-
516
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
517
-
518
- __global const uint8_t * q1 = x[i].qs + q_offset;
519
- __global const uint8_t * q2 = q1 + 64;
520
- __global const float * y1 = yy + i*QK_K + y_offset;
521
- __global const float * y2 = y1 + 128;
522
-
523
- const float dall = vload_half(0, &x[i].d);
524
- const float dmin = vload_half(0, &x[i].dmin);
525
-
526
- __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
527
- aux[0] = a[im+0] & kmask1;
528
- aux[1] = a[im+2] & kmask1;
529
- aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
530
- aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
531
-
532
- float4 s = (float4)(0.f);
533
- float smin = 0;
534
- for (int l = 0; l < n; ++l) {
535
- s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
536
- s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
537
- smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
538
- }
539
- tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
540
-
541
- }
542
-
543
- // sum up partial sums and write back result
544
- barrier(CLK_LOCAL_MEM_FENCE);
545
- for (int s=16; s>0; s>>=1) {
546
- if (tid < s) {
547
- tmp[tid] += tmp[tid + s];
548
- }
549
- barrier(CLK_LOCAL_MEM_FENCE);
550
- }
551
- if (tid == 0) {
552
- dst[row] = tmp[0];
553
- }
554
- }
555
-
556
- __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
557
-
558
- const uint16_t kmask1 = 0x3f3f;
559
- const uint16_t kmask2 = 0x0f0f;
560
- const uint16_t kmask3 = 0xc0c0;
561
-
562
- const int row = get_group_id(0);
563
- const int num_blocks_per_row = ncols / QK_K;
564
- const int ib0 = row*num_blocks_per_row + get_global_offset(0);
565
-
566
- const int tid = get_local_id(0)/2; // 0...15
567
- const int ix = get_local_id(0)%2;
568
-
569
- const int il = tid/4; // 0...3
570
- const int ir = tid - 4*il;// 0...3
571
- const int n = 2;
572
-
573
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
574
- const int in = il%2;
575
-
576
- const int l0 = n*(2*ir + in);
577
- const int q_offset = 32*im + l0;
578
- const int y_offset = 64*im + l0;
579
-
580
- const uint8_t hm1 = 1 << (2*im);
581
- const uint8_t hm2 = hm1 << 4;
582
-
583
- uint16_t aux[4];
584
- const uint8_t * sc = (const uint8_t *)aux;
585
-
586
- __global const struct block_q5_K * x = xx + ib0;
587
-
588
- tmp[16 * ix + tid] = 0;
589
-
590
- for (int i = ix; i < num_blocks_per_row; i += 2) {
591
-
592
- __global const uint8_t * ql1 = x[i].qs + q_offset;
593
- __global const uint8_t * ql2 = ql1 + 64;
594
- __global const uint8_t * qh = x[i].qh + l0;
595
- __global const float * y1 = yy + i*QK_K + y_offset;
596
- __global const float * y2 = y1 + 128;
597
-
598
- const float dall = vload_half(0, &x[i].d);
599
- const float dmin = vload_half(0, &x[i].dmin);
600
-
601
- __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
602
- aux[0] = a[im+0] & kmask1;
603
- aux[1] = a[im+2] & kmask1;
604
- aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
605
- aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
606
-
607
- float4 sum = (float4)(0.f);
608
- float smin = 0;
609
- for (int l = 0; l < n; ++l) {
610
- sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
611
- + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
612
- sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
613
- + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
614
- sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
615
- + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
616
- sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
617
- + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
618
- smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
619
- + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
620
- }
621
- tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
622
-
623
- }
624
-
625
- // sum up partial sums and write back result
626
- barrier(CLK_LOCAL_MEM_FENCE);
627
- for (int s=16; s>0; s>>=1) {
628
- if (tid < s) {
629
- tmp[tid] += tmp[tid + s];
630
- }
631
- barrier(CLK_LOCAL_MEM_FENCE);
632
- }
633
- if (tid == 0) {
634
- dst[row] = tmp[0];
635
- }
636
- }
637
-
638
- __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
639
-
640
- const int row = get_group_id(0);
641
-
642
- const int num_blocks_per_row = ncols / QK_K;
643
- const int ib0 = row*num_blocks_per_row + get_global_offset(0);
644
-
645
- __global const struct block_q6_K * x = xx + ib0;
646
-
647
- const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
648
- const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
649
-
650
- const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
651
-
652
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
653
- const int in = tid - step*im; // 0...15 or 0...7
654
-
655
- \n#if K_QUANTS_PER_ITERATION == 1\n
656
- const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
657
- const int is = 0;
658
-
659
- \n#else\n
660
-
661
- const int l0 = 4 * in; // 0, 4, 8, ..., 28
662
- const int is = in / 4;
663
-
664
- \n#endif\n
665
-
666
- const int ql_offset = 64*im + l0;
667
- const int qh_offset = 32*im + l0;
668
- const int s_offset = 8*im + is;
669
- const int y_offset = 128*im + l0;
670
-
671
- tmp[16 * ix + tid] = 0; // partial sum for thread in warp
672
-
673
- for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
674
-
675
- __global const float * y = yy + i * QK_K + y_offset;
676
- __global const uint8_t * ql = x[i].ql + ql_offset;
677
- __global const uint8_t * qh = x[i].qh + qh_offset;
678
- __global const int8_t * s = x[i].scales + s_offset;
679
-
680
- const float d = vload_half(0, &x[i].d);
681
-
682
- \n#if K_QUANTS_PER_ITERATION == 1\n
683
- float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
684
- + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
685
- + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
686
- + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
687
- + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
688
- + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
689
- + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
690
- +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
691
- tmp[16 * ix + tid] += sum;
692
- \n#else\n
693
- float sum = 0;
694
- for (int l = 0; l < 4; ++l) {
695
- sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
696
- + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
697
- + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
698
- + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
699
- }
700
- tmp[16 * ix + tid] += sum;
701
- \n#endif\n
702
-
703
- }
704
-
705
- // sum up partial sums and write back result
706
- barrier(CLK_LOCAL_MEM_FENCE);
707
- for (int s=16; s>0; s>>=1) {
708
- if (tid < s) {
709
- tmp[tid] += tmp[tid + s];
710
- }
711
- barrier(CLK_LOCAL_MEM_FENCE);
712
- }
713
- if (tid == 0) {
714
- dst[row] = tmp[0];
715
- }
716
- }
717
- );
718
-
719
-
720
- std::string dequant_template = MULTILINE_QUOTE(
721
- __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
722
- const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
723
-
724
- if (i >= get_global_size(0)) {
725
- return;
726
- }
727
-
728
- const uint qk = QUANT_K;
729
- const uint qr = QUANT_R;
730
-
731
- const int ib = i/qk + get_global_offset(0); // block index
732
- const int iqs = (i%qk)/qr; // quant index
733
- const int iybs = i - i%qk; // y block start index
734
- const int y_offset = qr == 1 ? 1 : qk/2;
735
-
736
- // dequantize
737
- float v0, v1;
738
- DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
739
- y[iybs + iqs + 0] = v0;
740
- y[iybs + iqs + y_offset] = v1;
741
- }
742
- );
743
-
744
- std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
745
- __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
746
- const int local_size = get_local_size(0);
747
- const int row = get_group_id(0);
748
- const int tid = get_local_id(0);
749
-
750
- const uint qk = QUANT_K;
751
- const uint qr = QUANT_R;
752
-
753
- const int col_step = local_size * 2;
754
- const int y_offset = qr == 1 ? 1 : qk/2;
755
-
756
- x += get_global_offset(0);
757
-
758
- tmp[tid] = 0;
759
-
760
- for (int col = tid*2; col < ncols; col += col_step) {
761
- const int ib = (row*ncols + col)/qk; // block index
762
- const int iqs = (col%qk)/qr; // quant index
763
- const int iybs = col - col%qk; // y block start index
764
-
765
- // dequantize
766
- float v0, v1;
767
- DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
768
-
769
- // matrix multiplication
770
- tmp[tid] += v0 * y[iybs + iqs + 0];
771
- tmp[tid] += v1 * y[iybs + iqs + y_offset];
772
- }
773
-
774
- // sum up partial sums and write back result
775
- barrier(CLK_LOCAL_MEM_FENCE);
776
- for (int s=local_size/2; s>0; s>>=1) {
777
- if (tid < s) {
778
- tmp[tid] += tmp[tid + s];
779
- }
780
- barrier(CLK_LOCAL_MEM_FENCE);
781
- }
782
- if (tid == 0) {
783
- dst[row] = tmp[0];
784
- }
785
- }
786
-
787
- );
788
-
789
-
790
- std::string mul_template = MULTILINE_QUOTE(
791
- __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
792
- const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
793
-
794
- if (i >= get_global_size(0)) {
795
- return;
796
- }
797
-
798
- dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
799
- }
800
- );
801
-
802
- std::string add_template = MULTILINE_QUOTE(
803
- __kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
804
- const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
805
-
806
- if (i >= get_global_size(0)) {
807
- return;
808
- }
809
-
810
- dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
811
- }
812
- );
813
-
814
- #define CL_CHECK(err) \
815
- do { \
816
- cl_int err_ = (err); \
817
- if (err_ != CL_SUCCESS) { \
818
- fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
819
- #err, err_, __FILE__, __LINE__); \
820
- exit(1); \
821
- } \
822
- } while (0)
823
-
824
- #define CLBLAST_CHECK(err) \
825
- do { \
826
- CLBlastStatusCode err_ = (err); \
827
- if (err_ != CLBlastSuccess) { \
828
- fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
829
- #err, err_, __FILE__, __LINE__); \
830
- exit(1); \
831
- } \
832
- } while (0)
833
-
834
- std::array<std::string, 5> dequant_str_keys = {
835
- "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
836
- };
837
-
838
- std::array<std::string, 30> dequant_str_values = {
839
- "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
840
- "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
841
- "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
842
- "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
843
- "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
844
- "convert_row_f16", "half", "1", "1", "convert_f16"
845
- };
846
-
847
- std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
848
- "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
849
- "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
850
- "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
851
- "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
852
- "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
853
- "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
854
- };
855
-
856
- std::array<std::string, 2> mul_str_keys = {
857
- "KERNEL_NAME", "TYPE"
858
- };
859
- std::array<std::string, 2> mul_str_values = {
860
- "mul_f32", "float"
861
- };
862
-
863
- static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
864
- size_t pos = 0;
865
- while ((pos = s.find(from, pos)) != std::string::npos) {
866
- s.replace(pos, from.length(), to);
867
- pos += to.length();
868
- }
869
- return s;
870
- }
871
-
872
- static std::string generate_kernels() {
873
- std::stringstream src;
874
- src << program_source << '\n';
875
- src << k_quants_source << '\n';
876
- for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
877
- std::string dequant_kernel = dequant_template;
878
- std::string dmmv_kernel = dequant_mul_mat_vec_template;
879
- for (size_t j = 0; j < dequant_str_keys.size(); j++) {
880
- replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
881
- replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
882
- }
883
- src << dequant_kernel << '\n';
884
- src << dmmv_kernel << '\n';
885
- }
886
- for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
887
- std::string mul_kernel = mul_template;
888
- for (size_t j = 0; j < mul_str_keys.size(); j++) {
889
- replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
890
- }
891
- src << mul_kernel << '\n';
892
- }
893
- src << add_template << '\n';
894
-
895
- return src.str();
896
- }
897
-
898
- static cl_platform_id platform;
899
- static cl_device_id device;
900
- static cl_context context;
901
- static cl_command_queue queue;
902
- static cl_program program;
903
- static cl_kernel convert_row_f16_cl;
904
- static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
905
- static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
906
- static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
907
- static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
908
- static cl_kernel mul_f32_cl;
909
- static cl_kernel add_f32_cl;
910
- static bool fp16_support;
911
-
912
- static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
913
- cl_program p;
914
- char *program_log;
915
- size_t program_size;
916
- size_t log_size;
917
- int err;
918
-
919
- program_size = strlen(program_buffer);
920
-
921
- p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
922
- if(err < 0) {
923
- fprintf(stderr, "OpenCL error creating program");
924
- exit(1);
925
- }
926
-
927
- std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
928
- "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
929
- "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
930
-
931
- err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
932
- if(err < 0) {
933
-
934
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
935
- program_log = (char*) malloc(log_size + 1);
936
- program_log[log_size] = '\0';
937
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
938
- fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log);
939
- free(program_log);
940
- exit(1);
941
- }
942
-
943
- return p;
944
- }
945
-
946
- void ggml_cl_init(void) {
947
- static bool initialized = false;
948
- if (initialized) {
949
- return;
950
- }
951
- initialized = true;
952
-
953
- cl_int err;
954
-
955
- struct cl_device;
956
- struct cl_platform {
957
- cl_platform_id id;
958
- unsigned number;
959
- char name[128];
960
- char vendor[128];
961
- struct cl_device * devices;
962
- unsigned n_devices;
963
- struct cl_device * default_device;
964
- };
965
-
966
- struct cl_device {
967
- struct cl_platform * platform;
968
- cl_device_id id;
969
- unsigned number;
970
- cl_device_type type;
971
- char name[128];
972
- };
973
-
974
- enum { NPLAT = 16, NDEV = 16 };
975
-
976
- struct cl_platform platforms[NPLAT];
977
- unsigned n_platforms = 0;
978
- struct cl_device devices[NDEV];
979
- unsigned n_devices = 0;
980
- struct cl_device * default_device = NULL;
981
-
982
- platform = NULL;
983
- device = NULL;
984
-
985
- cl_platform_id platform_ids[NPLAT];
986
- CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
987
-
988
- for (unsigned i = 0; i < n_platforms; i++) {
989
- struct cl_platform * p = &platforms[i];
990
- p->number = i;
991
- p->id = platform_ids[i];
992
- CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
993
- CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
994
-
995
- cl_device_id device_ids[NDEV];
996
- cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
997
- if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
998
- p->n_devices = 0;
999
- } else {
1000
- CL_CHECK(clGetDeviceIDsError);
1001
- }
1002
- p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
1003
- p->default_device = NULL;
1004
-
1005
- for (unsigned j = 0; j < p->n_devices; j++) {
1006
- struct cl_device * d = &devices[n_devices];
1007
- d->number = n_devices++;
1008
- d->id = device_ids[j];
1009
- d->platform = p;
1010
- CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
1011
- CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
1012
-
1013
- if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
1014
- p->default_device = d;
1015
- }
1016
- }
1017
-
1018
- if (default_device == NULL && p->default_device != NULL) {
1019
- default_device = p->default_device;
1020
- }
1021
- }
1022
-
1023
- if (n_devices == 0) {
1024
- fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n");
1025
- exit(1);
1026
- }
1027
-
1028
- char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
1029
- char * user_device_string = getenv("GGML_OPENCL_DEVICE");
1030
- int user_platform_number = -1;
1031
- int user_device_number = -1;
1032
-
1033
- unsigned n;
1034
- if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
1035
- user_platform_number = (int)n;
1036
- }
1037
- if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
1038
- user_device_number = (int)n;
1039
- }
1040
- if (user_platform_number != -1 && user_device_number != -1) {
1041
- cl_platform* platform = &platforms[user_platform_number];
1042
- if ((unsigned)user_device_number >= platform->n_devices) {
1043
- fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number);
1044
- exit(1);
1045
- }
1046
- default_device = &platform->devices[user_device_number];
1047
- } else {
1048
-
1049
- struct cl_device * selected_devices = devices;
1050
- unsigned n_selected_devices = n_devices;
1051
-
1052
- if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
1053
- for (unsigned i = 0; i < n_platforms; i++) {
1054
- struct cl_platform * p = &platforms[i];
1055
- if (strstr(p->name, user_platform_string) != NULL ||
1056
- strstr(p->vendor, user_platform_string) != NULL) {
1057
- user_platform_number = (int)i;
1058
- break;
1059
- }
1060
- }
1061
- if (user_platform_number == -1) {
1062
- fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
1063
- exit(1);
1064
- }
1065
- }
1066
- if (user_platform_number != -1) {
1067
- struct cl_platform * p = &platforms[user_platform_number];
1068
- selected_devices = p->devices;
1069
- n_selected_devices = p->n_devices;
1070
- default_device = p->default_device;
1071
- if (n_selected_devices == 0) {
1072
- fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
1073
- exit(1);
1074
- }
1075
- }
1076
-
1077
- if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
1078
- for (unsigned i = 0; i < n_selected_devices; i++) {
1079
- struct cl_device * d = &selected_devices[i];
1080
- if (strstr(d->name, user_device_string) != NULL) {
1081
- user_device_number = d->number;
1082
- break;
1083
- }
1084
- }
1085
- if (user_device_number == -1) {
1086
- fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
1087
- exit(1);
1088
- }
1089
- }
1090
- if (user_device_number != -1) {
1091
- selected_devices = &devices[user_device_number];
1092
- n_selected_devices = 1;
1093
- default_device = &selected_devices[0];
1094
- }
1095
-
1096
- GGML_ASSERT(n_selected_devices > 0);
1097
-
1098
- if (default_device == NULL) {
1099
- default_device = &selected_devices[0];
1100
- }
1101
- }
1102
-
1103
- fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
1104
- fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
1105
- if (default_device->type != CL_DEVICE_TYPE_GPU) {
1106
- fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
1107
- }
1108
-
1109
- platform = default_device->platform->id;
1110
- device = default_device->id;
1111
-
1112
- size_t ext_str_size;
1113
- clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
1114
- char *ext_buffer = (char *)alloca(ext_str_size + 1);
1115
- clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
1116
- ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
1117
- // Disabled due to faulty outputs
1118
- // Check if ext_buffer contains cl_khr_fp16
1119
- fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
1120
- // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
1121
-
1122
- cl_context_properties properties[] = {
1123
- (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
1124
- };
1125
-
1126
- CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
1127
-
1128
- CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
1129
- (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
1130
- (queue = clCreateCommandQueue(context, device, 0, &err), err)
1131
- )));
1132
-
1133
- const std::string kernel_src = generate_kernels();
1134
-
1135
- program = build_program_from_source(context, device, kernel_src.c_str());
1136
-
1137
- // FP16 to FP32 kernel
1138
- CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err));
1139
-
1140
- // Dequantize kernels
1141
- CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
1142
- CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
1143
- CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
1144
- CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
1145
- CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
1146
- CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
1147
- CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err));
1148
- CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err));
1149
- CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err));
1150
- CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err));
1151
- CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err));
1152
-
1153
- // dequant mul mat kernel
1154
- CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
1155
- CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err));
1156
- CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err));
1157
- CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
1158
- CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
1159
- CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
1160
- CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
1161
- CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
1162
- CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
1163
- CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
1164
- CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));
1165
-
1166
- // mul kernel
1167
- CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
1168
-
1169
- CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
1170
- }
1171
-
1172
- static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
1173
- switch (type) {
1174
- case GGML_TYPE_Q4_0:
1175
- return &dequantize_row_q4_0_cl;
1176
- case GGML_TYPE_Q4_1:
1177
- return &dequantize_row_q4_1_cl;
1178
- case GGML_TYPE_Q5_0:
1179
- return &dequantize_row_q5_0_cl;
1180
- case GGML_TYPE_Q5_1:
1181
- return &dequantize_row_q5_1_cl;
1182
- case GGML_TYPE_Q8_0:
1183
- return &dequantize_row_q8_0_cl;
1184
- case GGML_TYPE_Q2_K:
1185
- return &dequantize_block_q2_k_cl;
1186
- case GGML_TYPE_Q3_K:
1187
- return &dequantize_block_q3_k_cl;
1188
- case GGML_TYPE_Q4_K:
1189
- return &dequantize_block_q4_k_cl;
1190
- case GGML_TYPE_Q5_K:
1191
- return &dequantize_block_q5_k_cl;
1192
- case GGML_TYPE_Q6_K:
1193
- return &dequantize_block_q6_k_cl;
1194
- case GGML_TYPE_F16:
1195
- return &convert_row_f16_cl;
1196
- default:
1197
- return nullptr;
1198
- }
1199
- }
1200
-
1201
- static size_t ggml_cl_global_denom(ggml_type type) {
1202
- switch (type) {
1203
- case GGML_TYPE_Q4_0:
1204
- case GGML_TYPE_Q4_1:
1205
- case GGML_TYPE_Q5_0:
1206
- case GGML_TYPE_Q5_1:
1207
- case GGML_TYPE_Q8_0:
1208
- return 1;
1209
- case GGML_TYPE_Q2_K:
1210
- case GGML_TYPE_Q3_K:
1211
- return 4;
1212
- case GGML_TYPE_Q4_K:
1213
- return 8;
1214
- case GGML_TYPE_Q5_K:
1215
- case GGML_TYPE_Q6_K:
1216
- return 4;
1217
- case GGML_TYPE_F16:
1218
- default:
1219
- return 1;
1220
- }
1221
- }
1222
-
1223
- static size_t ggml_cl_local_size(ggml_type type) {
1224
- switch (type) {
1225
- case GGML_TYPE_Q4_0:
1226
- case GGML_TYPE_Q4_1:
1227
- case GGML_TYPE_Q5_0:
1228
- case GGML_TYPE_Q5_1:
1229
- case GGML_TYPE_Q8_0:
1230
- return 0;
1231
- case GGML_TYPE_Q2_K:
1232
- case GGML_TYPE_Q3_K:
1233
- return 64;
1234
- case GGML_TYPE_Q4_K:
1235
- return 32;
1236
- case GGML_TYPE_Q5_K:
1237
- case GGML_TYPE_Q6_K:
1238
- return 64;
1239
- case GGML_TYPE_F16:
1240
- default:
1241
- return 0;
1242
- }
1243
- }
1244
-
1245
- static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
1246
- switch (type) {
1247
- case GGML_TYPE_Q4_0:
1248
- return &dequantize_mul_mat_vec_q4_0_cl;
1249
- case GGML_TYPE_Q4_1:
1250
- return &dequantize_mul_mat_vec_q4_1_cl;
1251
- case GGML_TYPE_Q5_0:
1252
- return &dequantize_mul_mat_vec_q5_0_cl;
1253
- case GGML_TYPE_Q5_1:
1254
- return &dequantize_mul_mat_vec_q5_1_cl;
1255
- case GGML_TYPE_Q8_0:
1256
- return &dequantize_mul_mat_vec_q8_0_cl;
1257
- case GGML_TYPE_F16:
1258
- return &convert_mul_mat_vec_f16_cl;
1259
- case GGML_TYPE_Q2_K:
1260
- return &dequantize_mul_mat_vec_q2_K_cl;
1261
- case GGML_TYPE_Q3_K:
1262
- return &dequantize_mul_mat_vec_q3_K_cl;
1263
- case GGML_TYPE_Q4_K:
1264
- return &dequantize_mul_mat_vec_q4_K_cl;
1265
- case GGML_TYPE_Q5_K:
1266
- return &dequantize_mul_mat_vec_q5_K_cl;
1267
- case GGML_TYPE_Q6_K:
1268
- return &dequantize_mul_mat_vec_q6_K_cl;
1269
- default:
1270
- return nullptr;
1271
- }
1272
- }
1273
-
1274
- // buffer pool for cl
1275
- #define MAX_CL_BUFFERS 256
1276
-
1277
- struct scoped_spin_lock {
1278
- std::atomic_flag& lock;
1279
- scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
1280
- while (lock.test_and_set(std::memory_order_acquire)) {
1281
- ; // spin
1282
- }
1283
- }
1284
- ~scoped_spin_lock() {
1285
- lock.clear(std::memory_order_release);
1286
- }
1287
- scoped_spin_lock(const scoped_spin_lock&) = delete;
1288
- scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
1289
- };
1290
-
1291
- struct cl_buffer {
1292
- cl_mem mem;
1293
- size_t size = 0;
1294
- };
1295
-
1296
- static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
1297
- static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
1298
-
1299
- static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
1300
- scoped_spin_lock lock(g_cl_pool_lock);
1301
- cl_int err;
1302
-
1303
- int best_i = -1;
1304
- size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
1305
- int worst_i = -1;
1306
- size_t worst_size = 0; //largest unused buffer seen so far
1307
- for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
1308
- cl_buffer &b = g_cl_buffer_pool[i];
1309
- if (b.size > 0 && b.size >= size && b.size < best_size)
1310
- {
1311
- best_i = i;
1312
- best_size = b.size;
1313
- }
1314
- if (b.size > 0 && b.size > worst_size)
1315
- {
1316
- worst_i = i;
1317
- worst_size = b.size;
1318
- }
1319
- }
1320
- if(best_i!=-1) //found the smallest buffer that fits our needs
1321
- {
1322
- cl_buffer& b = g_cl_buffer_pool[best_i];
1323
- cl_mem mem = b.mem;
1324
- *actual_size = b.size;
1325
- b.size = 0;
1326
- return mem;
1327
- }
1328
- if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory
1329
- {
1330
- cl_buffer& b = g_cl_buffer_pool[worst_i];
1331
- cl_mem mem = b.mem;
1332
- b.size = 0;
1333
- clReleaseMemObject(mem);
1334
- }
1335
- cl_mem mem;
1336
- CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
1337
- *actual_size = size;
1338
- return mem;
1339
- }
1340
-
1341
- static void ggml_cl_pool_free(cl_mem mem, size_t size) {
1342
- scoped_spin_lock lock(g_cl_pool_lock);
1343
-
1344
- for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
1345
- cl_buffer& b = g_cl_buffer_pool[i];
1346
- if (b.size == 0) {
1347
- b.mem = mem;
1348
- b.size = size;
1349
- return;
1350
- }
1351
- }
1352
- fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n");
1353
- clReleaseMemObject(mem);
1354
- }
1355
-
1356
- void ggml_cl_free_data(const struct ggml_tensor* tensor) {
1357
- if (tensor->backend != GGML_BACKEND_TYPE_GPU) {
1358
- return;
1359
- }
1360
-
1361
- cl_mem mem = (cl_mem)tensor->extra;
1362
- clReleaseMemObject(mem);
1363
- }
1364
-
1365
- static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
1366
- cl_int err;
1367
- const uint64_t ne0 = src->ne[0];
1368
- const uint64_t ne1 = src->ne[1];
1369
- const uint64_t nb0 = src->nb[0];
1370
- const uint64_t nb1 = src->nb[1];
1371
- const uint64_t nb2 = src->nb[2];
1372
- const uint64_t nb3 = src->nb[3];
1373
- const enum ggml_type type = src->type;
1374
- const size_t ts = ggml_type_size(type);
1375
- const size_t bs = ggml_blck_size(type);
1376
- const uint64_t row_size = ts*ne0/bs;
1377
-
1378
- const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
1379
- if (nb0 == ts && nb1 == row_size) {
1380
- return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
1381
- }
1382
- if (nb0 == ts) {
1383
- const size_t buffer_origin[3] = { offset, 0, 0 };
1384
- const size_t host_origin[3] = { 0, 0, 0 };
1385
- const size_t region[3] = { row_size, ne1, 1 };
1386
- return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
1387
- }
1388
- std::vector<cl_event> events;
1389
- if (ev && ne1>1) events.reserve(ne1-1);
1390
- for (uint64_t i1 = 0; i1 < ne1; i1++) {
1391
- // pretend the row is a matrix with cols=1
1392
- const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
1393
- const size_t host_origin[3] = { 0, 0, 0 };
1394
- const size_t region[3] = { ts, ne0/bs, 1 };
1395
- // if an event is requested, make the last write wait for all previous writes to complete
1396
- if (ev && i1) {
1397
- events.push_back(*ev);
1398
- }
1399
- cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
1400
- err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
1401
- if (err != CL_SUCCESS) {
1402
- for (auto event : events) {
1403
- clReleaseEvent(event);
1404
- }
1405
- return err;
1406
- }
1407
- }
1408
- for (auto event : events) {
1409
- CL_CHECK(clReleaseEvent(event));
1410
- }
1411
- return CL_SUCCESS;
1412
- }
1413
-
1414
- static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1415
- GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
1416
- const int64_t ne00 = src0->ne[0];
1417
- const int64_t ne01 = src0->ne[1];
1418
- const int64_t ne02 = src0->ne[2];
1419
- const int64_t ne03 = src0->ne[3];
1420
- const int64_t ne10 = src1->ne[0];
1421
- const int64_t ne11 = src1->ne[1];
1422
- const int64_t ne12 = src1->ne[2];
1423
- const int64_t ne13 = src1->ne[3];
1424
- const int nb2 = dst->nb[2];
1425
- const int nb3 = dst->nb[3];
1426
- size_t x_size;
1427
- size_t d_size;
1428
-
1429
- cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
1430
- cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
1431
- cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
1432
-
1433
-
1434
- for (int64_t i03 = 0; i03 < ne03; i03++) {
1435
- for (int64_t i02 = 0; i02 < ne02; i02++) {
1436
- cl_event ev;
1437
-
1438
- // copy src0 to device
1439
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
1440
-
1441
- const int64_t i13 = i03%ne13;
1442
- const int64_t i12 = i02%ne12;
1443
- const int i1 = i13*ne12*ne11 + i12*ne11;
1444
-
1445
- cl_int x_offset = 0;
1446
- cl_int y_offset = i1*ne10;
1447
- cl_int d_offset = 0;
1448
-
1449
- size_t global = ne00 * ne01;
1450
- cl_int ky = ne10 * ne11;
1451
-
1452
- CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
1453
- CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
1454
- CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
1455
- CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
1456
- CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
1457
- CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
1458
- CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
1459
- CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
1460
-
1461
- CL_CHECK(clReleaseEvent(ev));
1462
- CL_CHECK(clFinish(queue));
1463
-
1464
- // copy dst to host
1465
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1466
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
1467
- }
1468
- }
1469
- ggml_cl_pool_free(d_X, x_size);
1470
- ggml_cl_pool_free(d_D, d_size);
1471
- }
1472
-
1473
- void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1474
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1475
- ggml_cl_mul_f32(src0, src1, dst);
1476
- }
1477
-
1478
- static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1479
- GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
1480
- const int64_t ne00 = src0->ne[0];
1481
- const int64_t ne01 = src0->ne[1];
1482
- const int64_t ne02 = src0->ne[2];
1483
- const int64_t ne03 = src0->ne[3];
1484
- const int64_t ne10 = src1->ne[0];
1485
- const int64_t ne11 = src1->ne[1];
1486
- const int64_t ne12 = src1->ne[2];
1487
- const int64_t ne13 = src1->ne[3];
1488
- const int nb2 = dst->nb[2];
1489
- const int nb3 = dst->nb[3];
1490
- size_t x_size;
1491
- size_t d_size;
1492
-
1493
- cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
1494
- cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
1495
- cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
1496
-
1497
-
1498
- for (int64_t i03 = 0; i03 < ne03; i03++) {
1499
- for (int64_t i02 = 0; i02 < ne02; i02++) {
1500
- cl_event ev;
1501
-
1502
- // copy src0 to device
1503
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
1504
-
1505
- const int64_t i13 = i03%ne13;
1506
- const int64_t i12 = i02%ne12;
1507
- const int i1 = i13*ne12*ne11 + i12*ne11;
1508
-
1509
- cl_int x_offset = 0;
1510
- cl_int y_offset = i1*ne10;
1511
- cl_int d_offset = 0;
1512
-
1513
- size_t global = ne00 * ne01;
1514
- cl_int ky = ne10 * ne11;
1515
-
1516
- CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
1517
- CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
1518
- CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
1519
- CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
1520
- CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
1521
- CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
1522
- CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
1523
- CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
1524
-
1525
- CL_CHECK(clReleaseEvent(ev));
1526
- CL_CHECK(clFinish(queue));
1527
-
1528
- // copy dst to host
1529
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1530
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
1531
- }
1532
- }
1533
- ggml_cl_pool_free(d_X, x_size);
1534
- ggml_cl_pool_free(d_D, d_size);
1535
- }
1536
-
1537
- void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1538
- GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1539
- ggml_cl_add_f32(src0, src1, dst);
1540
- }
1541
-
1542
- static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1543
- const int64_t ne00 = src0->ne[0];
1544
- const int64_t ne01 = src0->ne[1];
1545
- const int64_t ne02 = src0->ne[2];
1546
- const int64_t ne03 = src0->ne[3];
1547
-
1548
- const int64_t ne10 = src1->ne[0];
1549
- const int64_t ne11 = src1->ne[1];
1550
- const int64_t ne12 = src1->ne[2];
1551
- const int64_t ne13 = src1->ne[3];
1552
-
1553
- const int nb2 = dst->nb[2];
1554
- const int nb3 = dst->nb[3];
1555
-
1556
- const int64_t r2 = ne12 / ne02;
1557
- const int64_t r3 = ne13 / ne03;
1558
-
1559
- const float alpha = 1.0f;
1560
- const float beta = 0.0f;
1561
- const int x_ne = ne01 * ne00;
1562
- const int y_ne = ne11 * ne10;
1563
- const int d_ne = ne11 * ne01;
1564
-
1565
- size_t x_size;
1566
- size_t y_size;
1567
- size_t d_size;
1568
- cl_mem d_X;
1569
- if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
1570
- d_X = (cl_mem) src0->extra;
1571
- } else {
1572
- d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1573
- }
1574
- cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1575
- cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1576
-
1577
- size_t x_offset = 0;
1578
-
1579
- for (int64_t i03 = 0; i03 < ne03; i03++) {
1580
- // TODO: copy src0 here when r3>1
1581
- for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1582
- for (int64_t i02 = 0; i02 < ne02; i02++) {
1583
- if (src0->backend == GGML_BACKEND_TYPE_GPU) {
1584
- x_offset = (i03 * ne02 + i02) * x_ne;
1585
- } else {
1586
- // copy src0 to device
1587
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
1588
- }
1589
-
1590
- for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1591
- // copy src1 to device
1592
- if (src1->backend == GGML_BACKEND_TYPE_CPU) {
1593
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
1594
- }
1595
-
1596
- CL_CHECK(clFinish(queue));
1597
-
1598
- // compute
1599
- cl_event ev_sgemm;
1600
- clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
1601
- clblast::Transpose::kYes, clblast::Transpose::kNo,
1602
- ne01, ne11, ne10,
1603
- alpha,
1604
- d_X, x_offset, ne00,
1605
- d_Y, 0, ne10,
1606
- beta,
1607
- d_D, 0, ne01,
1608
- &queue, &ev_sgemm);
1609
-
1610
- if (status != clblast::StatusCode::kSuccess) {
1611
- GGML_ASSERT(false);
1612
- }
1613
-
1614
- // copy dst to host
1615
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
1616
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1617
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
1618
- }
1619
- }
1620
- }
1621
- }
1622
- }
1623
-
1624
- if (src0->backend != GGML_BACKEND_TYPE_GPU) {
1625
- ggml_cl_pool_free(d_X, x_size);
1626
- }
1627
- if (src1->backend != GGML_BACKEND_TYPE_GPU) {
1628
- ggml_cl_pool_free(d_Y, y_size);
1629
- }
1630
- if (dst->backend != GGML_BACKEND_TYPE_GPU) {
1631
- ggml_cl_pool_free(d_D, d_size);
1632
- }
1633
- }
1634
-
1635
- static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
1636
- GGML_ASSERT(fp16_support);
1637
-
1638
- const int64_t ne00 = src0->ne[0];
1639
- const int64_t ne01 = src0->ne[1];
1640
- const int64_t ne02 = src0->ne[2];
1641
- const int64_t ne03 = src0->ne[3];
1642
-
1643
- const int64_t ne10 = src1->ne[0];
1644
- const int64_t ne11 = src1->ne[1];
1645
- const int64_t ne12 = src1->ne[2];
1646
- const int64_t ne13 = src1->ne[3];
1647
-
1648
- const int nb10 = src1->nb[0];
1649
- const int nb11 = src1->nb[1];
1650
- const int nb12 = src1->nb[2];
1651
- const int nb13 = src1->nb[3];
1652
-
1653
- const int nb2 = dst->nb[2];
1654
- const int nb3 = dst->nb[3];
1655
-
1656
- const int64_t r2 = ne12 / ne02;
1657
- const int64_t r3 = ne13 / ne03;
1658
-
1659
- const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
1660
- const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
1661
- const int x_ne = ne01 * ne00;
1662
- const int y_ne = ne11 * ne10;
1663
- const int d_ne = ne11 * ne01;
1664
-
1665
- GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
1666
- GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
1667
- ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
1668
-
1669
- size_t x_size;
1670
- size_t y_size;
1671
- size_t d_size;
1672
- cl_mem d_X;
1673
- if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT
1674
- d_X = (cl_mem) src0->extra;
1675
- } else {
1676
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
1677
- }
1678
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
1679
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
1680
-
1681
- bool src1_cont_rows = nb10 == sizeof(float);
1682
- bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
1683
-
1684
- size_t x_offset = 0;
1685
-
1686
- for (int64_t i03 = 0; i03 < ne03; i03++) {
1687
- // TODO: copy src0 here when r3>1
1688
- for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1689
- for (int64_t i02 = 0; i02 < ne02; i02++) {
1690
- if (src0->backend == GGML_BACKEND_TYPE_GPU) {
1691
- x_offset = (i03 * ne02 + i02) * x_ne;
1692
- } else {
1693
- // copy src0 to device
1694
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
1695
- }
1696
-
1697
- // FIXME: convert on device
1698
-
1699
- for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
1700
- // convert src1 to fp16
1701
- // TODO: use multiple threads
1702
- char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
1703
- if (src1_cont_rows) {
1704
- if (src1_cont_cols) {
1705
- ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
1706
- }
1707
- else {
1708
- for (int64_t i11 = 0; i11 < ne11; i11++) {
1709
- ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
1710
- }
1711
- }
1712
- }
1713
- else {
1714
- for (int64_t i11 = 0; i11 < ne11; i11++) {
1715
- for (int64_t i10 = 0; i10 < ne10; i10++) {
1716
- // very slow due to no inlining
1717
- tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
1718
- }
1719
- }
1720
- }
1721
-
1722
- // copy src1 to device
1723
- CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
1724
-
1725
- CL_CHECK(clFinish(queue));
1726
-
1727
- // compute
1728
- cl_event ev_sgemm;
1729
- clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
1730
- clblast::Transpose::kYes, clblast::Transpose::kNo,
1731
- ne01, ne11, ne10,
1732
- alpha,
1733
- d_X, x_offset, ne00,
1734
- d_Y, 0, ne10,
1735
- beta,
1736
- d_D, 0, ne01,
1737
- &queue, &ev_sgemm);
1738
-
1739
- if (status != clblast::StatusCode::kSuccess) {
1740
- GGML_ASSERT(false);
1741
- }
1742
-
1743
- // copy dst to host, then convert to float
1744
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
1745
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
1746
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1747
- ggml_fp16_to_fp32_row(tmp, d, d_ne);
1748
- } else {
1749
- // FIXME: convert dst to fp32 on device
1750
- }
1751
- }
1752
- }
1753
- }
1754
- }
1755
-
1756
- if (src0->backend != GGML_BACKEND_TYPE_GPU) {
1757
- ggml_cl_pool_free(d_X, x_size);
1758
- }
1759
- ggml_cl_pool_free(d_Y, y_size);
1760
- ggml_cl_pool_free(d_D, d_size);
1761
- }
1762
-
1763
- static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1764
- const int64_t ne00 = src0->ne[0];
1765
- const int64_t ne01 = src0->ne[1];
1766
- const int64_t ne02 = src0->ne[2];
1767
- const int64_t ne03 = src0->ne[3];
1768
-
1769
- const int64_t ne10 = src1->ne[0];
1770
- const int64_t ne11 = src1->ne[1];
1771
- const int64_t ne12 = src1->ne[2];
1772
- const int64_t ne13 = src1->ne[3];
1773
-
1774
- const int nb2 = dst->nb[2];
1775
- const int nb3 = dst->nb[3];
1776
- const ggml_type type = src0->type;
1777
- const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
1778
-
1779
- const int64_t r2 = ne12 / ne02;
1780
- const int64_t r3 = ne13 / ne03;
1781
-
1782
- const float alpha = 1.0f;
1783
- const float beta = 0.0f;
1784
- const int x_ne = ne01 * ne00;
1785
- const int y_ne = ne11 * ne10;
1786
- const int d_ne = ne11 * ne01;
1787
- const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
1788
- const size_t q_sz = ggml_type_size(type) * x_bps;
1789
-
1790
- size_t x_size;
1791
- size_t y_size;
1792
- size_t d_size;
1793
- size_t q_size;
1794
- cl_mem d_X;
1795
- if (!mul_mat_vec) {
1796
- d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1797
- }
1798
- cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1799
- cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1800
- cl_mem d_Q;
1801
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
1802
- d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
1803
- }
1804
-
1805
- cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
1806
- cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
1807
- GGML_ASSERT(to_fp32_cl != nullptr);
1808
-
1809
- const size_t global_denom = ggml_cl_global_denom(type);
1810
- const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
1811
-
1812
- size_t ev_idx = 0;
1813
- std::vector<cl_event> events;
1814
-
1815
- for (int64_t i03 = 0; i03 < ne03; i03++) {
1816
- // TODO: copy and dequantize src0 here when r3>1
1817
- for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
1818
- for (int64_t i02 = 0; i02 < ne02; i02++) {
1819
- // copy src0 to device if necessary
1820
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
1821
- events.emplace_back();
1822
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
1823
- } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
1824
- d_Q = (cl_mem) src0->extra;
1825
- } else {
1826
- GGML_ASSERT(false);
1827
- }
1828
-
1829
- if (!mul_mat_vec) {
1830
- // convert src0 to fp32 on device
1831
- const size_t global = x_ne / global_denom;
1832
- const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
1833
- CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
1834
- CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
1835
- CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
1836
- }
1837
-
1838
- int64_t i12 = i02 * r2;
1839
- int64_t e12 = i12 + r2;
1840
- events.reserve(e12 - i12);
1841
- for (; i12 < e12; i12++) {
1842
- if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
1843
- // copy src1 to device
1844
- events.emplace_back();
1845
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
1846
-
1847
- // compute
1848
- const size_t global = ne01 * local;
1849
- const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0;
1850
- const cl_int ncols = ne00;
1851
- events.emplace_back();
1852
- CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
1853
- CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
1854
- CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
1855
- CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
1856
- CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
1857
- CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
1858
- } else { // CLBlast matrix matrix multiplication
1859
- // copy src1 to device
1860
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
1861
-
1862
- // wait for conversion
1863
- CL_CHECK(clFinish(queue));
1864
-
1865
- // compute
1866
- events.emplace_back();
1867
- clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
1868
- clblast::Transpose::kYes, clblast::Transpose::kNo,
1869
- ne01, ne11, ne10,
1870
- alpha,
1871
- d_X, 0, ne00,
1872
- d_Y, 0, ne10,
1873
- beta,
1874
- d_D, 0, ne01,
1875
- &queue, events.data() + ev_idx++);
1876
-
1877
- if (status != clblast::StatusCode::kSuccess) {
1878
- GGML_ASSERT(false);
1879
- }
1880
- }
1881
-
1882
- // copy dst to host
1883
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
1884
- CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
1885
- for (auto *event : events) {
1886
- clReleaseEvent(event);
1887
- }
1888
-
1889
- ev_idx = 0;
1890
- events.clear();
1891
- }
1892
- }
1893
- }
1894
- }
1895
-
1896
- if (!mul_mat_vec) {
1897
- ggml_cl_pool_free(d_X, x_size);
1898
- }
1899
- ggml_cl_pool_free(d_Y, y_size);
1900
- ggml_cl_pool_free(d_D, d_size);
1901
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
1902
- ggml_cl_pool_free(d_Q, q_size);
1903
- }
1904
- }
1905
-
1906
-
1907
- bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
1908
- const int64_t ne10 = src1->ne[0];
1909
-
1910
- const int64_t ne0 = dst->ne[0];
1911
- const int64_t ne1 = dst->ne[1];
1912
-
1913
- // TODO: find the optimal values for these
1914
- if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
1915
- src1->type == GGML_TYPE_F32 &&
1916
- dst->type == GGML_TYPE_F32 &&
1917
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) {
1918
- return true;
1919
- }
1920
-
1921
- return false;
1922
- }
1923
-
1924
- static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
1925
- // If device doesn't support FP16
1926
- if (!fp16_support) {
1927
- return false;
1928
- }
1929
-
1930
- size_t src0_sz = ggml_nbytes(src0);
1931
- size_t src1_sz = ggml_nbytes(src1);
1932
-
1933
- // mul_mat_q: src0 is converted to fp32 on device
1934
- size_t mul_mat_q_transfer = src0_sz + src1_sz;
1935
-
1936
- // mul_mat_f16: src1 is converted to fp16 on cpu
1937
- size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
1938
-
1939
- // choose the smaller one to transfer to the device
1940
- // TODO: this is not always the best choice due to the overhead of converting to fp16
1941
- return mul_mat_f16_transfer < mul_mat_q_transfer;
1942
- }
1943
-
1944
- void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
1945
- GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
1946
-
1947
- if (src0->type == GGML_TYPE_F32) {
1948
- ggml_cl_mul_mat_f32(src0, src1, dst);
1949
- }
1950
- else if (src0->type == GGML_TYPE_F16) {
1951
- if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
1952
- ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
1953
- }
1954
- else {
1955
- ggml_cl_mul_mat_q_f32(src0, src1, dst);
1956
- }
1957
- }
1958
- else if (ggml_is_quantized(src0->type)) {
1959
- ggml_cl_mul_mat_q_f32(src0, src1, dst);
1960
- }
1961
- else {
1962
- GGML_ASSERT(false);
1963
- }
1964
- }
1965
-
1966
- size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1967
- if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
1968
- return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
1969
- }
1970
- return 0;
1971
- }
1972
-
1973
- void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
1974
- const int64_t ne0 = tensor->ne[0];
1975
- const int64_t ne1 = tensor->ne[1];
1976
- const int64_t ne2 = tensor->ne[2];
1977
- const int64_t ne3 = tensor->ne[3];
1978
-
1979
- const ggml_type type = tensor->type;
1980
- const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
1981
- const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
1982
-
1983
- size_t q_size;
1984
- cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
1985
-
1986
- tensor->data = data;
1987
- // copy tensor to device
1988
- size_t offset = 0;
1989
- for (int64_t i3 = 0; i3 < ne3; i3++) {
1990
- for (int64_t i2 = 0; i2 < ne2; i2++) {
1991
- CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
1992
- offset += s_sz;
1993
- }
1994
- }
1995
-
1996
- CL_CHECK(clFinish(queue));
1997
-
1998
- tensor->extra = dst;
1999
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
2000
- }
2001
-
2002
- // ggml-backend
2003
-
2004
- // buffer
2005
-
2006
- struct ggml_backend_opencl_buffer_context {
2007
- ~ggml_backend_opencl_buffer_context() {
2008
- if (buffer) {
2009
- clReleaseMemObject(buffer);
2010
- }
2011
- for (auto * sub_buffer : sub_buffers) {
2012
- clReleaseMemObject(sub_buffer);
2013
- }
2014
- }
2015
-
2016
- cl_mem buffer;
2017
- std::vector<cl_mem> sub_buffers;
2018
- };
2019
-
2020
- static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
2021
-
2022
- static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
2023
- return "OpenCL";
2024
-
2025
- GGML_UNUSED(buffer);
2026
- }
2027
-
2028
- static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2029
- ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
2030
- delete ctx;
2031
- }
2032
-
2033
- static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
2034
- return cl_ptr_base;
2035
-
2036
- GGML_UNUSED(buffer);
2037
- }
2038
-
2039
- static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
2040
- if (tensor->view_src != NULL && tensor->view_offs == 0) {
2041
- tensor->extra = tensor->view_src->extra;
2042
- } else {
2043
- ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
2044
- cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
2045
- cl_int err;
2046
- cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
2047
- CL_CHECK(err);
2048
- ctx->sub_buffers.push_back(sub_buffer);
2049
- tensor->extra = sub_buffer;
2050
- }
2051
- tensor->backend = GGML_BACKEND_TYPE_GPU;
2052
- }
2053
-
2054
- static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2055
- cl_mem tensor_buffer = (cl_mem) tensor->extra;
2056
- CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
2057
- CL_CHECK(clFinish(queue));
2058
-
2059
- GGML_UNUSED(buffer);
2060
- }
2061
-
2062
- static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
2063
- cl_mem tensor_buffer = (cl_mem) tensor->extra;
2064
- CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
2065
- CL_CHECK(clFinish(queue));
2066
-
2067
- GGML_UNUSED(buffer);
2068
- }
2069
-
2070
- static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
2071
- ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
2072
- CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
2073
- CL_CHECK(clFinish(queue));
2074
- }
2075
-
2076
- static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
2077
- ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
2078
- for (auto * sub_buffer : ctx->sub_buffers) {
2079
- clReleaseMemObject(sub_buffer);
2080
- }
2081
- ctx->sub_buffers.clear();
2082
- }
2083
-
2084
- static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
2085
- /* .get_name = */ ggml_backend_opencl_buffer_get_name,
2086
- /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
2087
- /* .get_base = */ ggml_backend_opencl_buffer_get_base,
2088
- /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
2089
- /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
2090
- /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
2091
- /* .cpy_tensor = */ NULL,
2092
- /* .clear = */ ggml_backend_opencl_buffer_clear,
2093
- /* .reset = */ ggml_backend_opencl_buffer_reset,
2094
- };
2095
-
2096
- // buffer type
2097
-
2098
- static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
2099
- return "OpenCL";
2100
-
2101
- GGML_UNUSED(buffer_type);
2102
- }
2103
-
2104
- static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
2105
- ggml_cl_init();
2106
-
2107
- cl_int err;
2108
- cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
2109
- if (err != CL_SUCCESS) {
2110
- fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
2111
- return nullptr;
2112
- }
2113
-
2114
- ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
2115
-
2116
- return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
2117
- }
2118
-
2119
- static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
2120
- // FIXME: not thread safe, device may not be initialized yet
2121
- static cl_uint alignment = -1;
2122
- if (alignment == (cl_uint)-1) {
2123
- ggml_cl_init();
2124
- clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
2125
- alignment /= 8; // bits to bytes
2126
- }
2127
- return alignment;
2128
-
2129
- GGML_UNUSED(buffer_type);
2130
- }
2131
-
2132
- static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
2133
- static size_t max_size = -1;
2134
- if (max_size == (size_t)-1) {
2135
- ggml_cl_init();
2136
- clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
2137
- }
2138
- return max_size;
2139
- }
2140
-
2141
- static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
2142
- //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
2143
- return ggml_backend_is_cpu(backend);
2144
-
2145
- GGML_UNUSED(buffer_type);
2146
- }
2147
-
2148
- static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
2149
- /* .get_name = */ ggml_backend_opencl_buffer_type_name,
2150
- /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
2151
- /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
2152
- /* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
2153
- /* .get_alloc_size = */ NULL,
2154
- /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
2155
- /* .is_host = */ NULL,
2156
- };
2157
-
2158
-
2159
- ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
2160
- static ggml_backend_buffer_type buffer_type = {
2161
- /* .iface = */ ggml_backend_opencl_buffer_type_interface,
2162
- /* .context = */ nullptr,
2163
- };
2164
-
2165
- return &buffer_type;
2166
- }
2167
-
2168
- #if 0
2169
- // host buffer type
2170
-
2171
- static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
2172
- return "CL_Host";
2173
-
2174
- GGML_UNUSED(buft);
2175
- }
2176
-
2177
- static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
2178
- return "CL_Host";
2179
-
2180
- GGML_UNUSED(buffer);
2181
- }
2182
-
2183
- static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
2184
- ggml_cl_host_free(buffer->context);
2185
- }
2186
-
2187
- static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
2188
- void * ptr = ggml_cl_host_malloc(size);
2189
-
2190
- if (ptr == nullptr) {
2191
- // fallback to cpu buffer
2192
- return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
2193
- }
2194
-
2195
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
2196
- buffer->buft = buft;
2197
- buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
2198
- buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
2199
-
2200
- return buffer;
2201
- }
2202
-
2203
- ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
2204
- static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
2205
- /* .iface = */ {
2206
- /* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
2207
- /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
2208
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
2209
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
2210
- /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
2211
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
2212
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
2213
- },
2214
- /* .context = */ nullptr,
2215
- };
2216
-
2217
- return &ggml_backend_opencl_buffer_type_host;
2218
- }
2219
-
2220
- // backend
2221
-
2222
- static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
2223
- return "OpenCL";
2224
-
2225
- GGML_UNUSED(backend);
2226
- }
2227
-
2228
- static void ggml_backend_opencl_free(ggml_backend_t backend) {
2229
- GGML_UNUSED(backend);
2230
- }
2231
-
2232
- static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
2233
- return ggml_backend_opencl_buffer_type();
2234
-
2235
- GGML_UNUSED(backend);
2236
- }
2237
-
2238
- static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
2239
- for (int i = 0; i < graph->n_nodes; ++i) {
2240
- ggml_tensor * node = graph->nodes[i];
2241
-
2242
- if (ggml_is_empty(node)) {
2243
- continue;
2244
- }
2245
-
2246
- switch (node->op) {
2247
- case GGML_OP_MUL_MAT:
2248
- ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
2249
- break;
2250
- case GGML_OP_MUL:
2251
- ggml_cl_mul(node->src[0], node->src[1], node);
2252
- break;
2253
- default:
2254
- GGML_ASSERT(false);
2255
- }
2256
- }
2257
-
2258
- return GGML_STATUS_SUCCESS;
2259
-
2260
- GGML_UNUSED(backend);
2261
- }
2262
-
2263
- static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
2264
- switch (op->op) {
2265
- case GGML_OP_MUL_MAT:
2266
- return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
2267
- case GGML_OP_MUL:
2268
- // return ggml_can_repeat_rows(op->src[1], op->src[0]);
2269
- return true;
2270
- default:
2271
- return false;
2272
- }
2273
-
2274
- GGML_UNUSED(backend);
2275
- }
2276
-
2277
- static ggml_backend_i opencl_backend_i = {
2278
- /* .get_name = */ ggml_backend_opencl_name,
2279
- /* .free = */ ggml_backend_opencl_free,
2280
- /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
2281
- /* .set_tensor_async = */ NULL,
2282
- /* .get_tensor_async = */ NULL,
2283
- /* .cpy_tensor_from_async = */ NULL,
2284
- /* .cpy_tensor_to_async = */ NULL,
2285
- /* .synchronize = */ NULL,
2286
- /* .graph_plan_create = */ NULL,
2287
- /* .graph_plan_free = */ NULL,
2288
- /* .graph_plan_compute = */ NULL,
2289
- /* .graph_compute = */ ggml_backend_opencl_graph_compute,
2290
- /* .supports_op = */ ggml_backend_opencl_supports_op,
2291
- };
2292
-
2293
- ggml_backend_t ggml_backend_opencl_init() {
2294
- ggml_backend_t backend = new ggml_backend {
2295
- /* .interface = */ opencl_backend_i,
2296
- /* .context = */ nullptr
2297
- };
2298
-
2299
- return backend;
2300
- }
2301
-
2302
- bool ggml_backend_is_opencl(ggml_backend_t backend) {
2303
- return backend && backend->iface.get_name == ggml_backend_opencl_name;
2304
- }
2305
- #endif
 
ggml-opencl.h DELETED
@@ -1,36 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml.h"
4
- #include "ggml-backend.h"
5
-
6
- #ifdef __cplusplus
7
- extern "C" {
8
- #endif
9
-
10
- GGML_API void ggml_cl_init(void);
11
-
12
- GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
13
- GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
14
- GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
15
- GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
16
- GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
17
-
18
- // GGML_API void * ggml_cl_host_malloc(size_t size);
19
- // GGML_API void ggml_cl_host_free(void * ptr);
20
-
21
- GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
22
-
23
- GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
24
-
25
- // backend API
26
-
27
- // GGML_API ggml_backend_t ggml_backend_opencl_init(void);
28
-
29
- // GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
30
-
31
- GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
32
- // GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
33
-
34
- #ifdef __cplusplus
35
- }
36
- #endif
 
scripts/sync-ggml-am.sh CHANGED
@@ -111,8 +111,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
111
  # src/ggml-kompute.h -> ggml-kompute.h
112
  # src/ggml-metal.h -> ggml-metal.h
113
  # src/ggml-metal.m -> ggml-metal.m
114
- # src/ggml-opencl.cpp -> ggml-opencl.cpp
115
- # src/ggml-opencl.h -> ggml-opencl.h
116
  # src/ggml-quants.c -> ggml-quants.c
117
  # src/ggml-quants.h -> ggml-quants.h
118
  # src/ggml-rpc.cpp -> ggml-rpc.cpp
@@ -155,8 +153,6 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
155
  -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
156
  -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
157
  -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
158
- -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
159
- -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
160
  -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
161
  -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
162
  -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
 
111
  # src/ggml-kompute.h -> ggml-kompute.h
112
  # src/ggml-metal.h -> ggml-metal.h
113
  # src/ggml-metal.m -> ggml-metal.m
 
 
114
  # src/ggml-quants.c -> ggml-quants.c
115
  # src/ggml-quants.h -> ggml-quants.h
116
  # src/ggml-rpc.cpp -> ggml-rpc.cpp
 
153
  -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
154
  -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
155
  -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
 
 
156
  -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
157
  -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
158
  -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
scripts/sync-ggml.sh CHANGED
@@ -14,8 +14,6 @@ cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h
14
  cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
15
  cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
16
  cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
17
- cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
18
- cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
19
  cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
20
  cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
21
  cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp
 
14
  cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
15
  cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
16
  cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
 
 
17
  cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
18
  cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
19
  cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp