ggerganov committed
Commit ced3ca3 · unverified · 1 parent: 29932e6

opencl : sync latest ggml-opencl

Files changed (5)
  1. CMakeLists.txt +1 -1
  2. Makefile +2 -2
  3. extra/sync-ggml.sh +1 -1
  4. ggml-opencl.c +0 -361
  5. ggml-opencl.cpp +1684 -0
CMakeLists.txt CHANGED
@@ -182,7 +182,7 @@ if (WHISPER_CLBLAST)
     if (CLBlast_FOUND)
         message(STATUS "CLBlast found")
 
-        set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+        set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
 
         add_compile_definitions(GGML_USE_CLBLAST)
 
Makefile CHANGED
@@ -175,8 +175,8 @@ ifdef WHISPER_CLBLAST
 	CFLAGS += -DGGML_USE_CLBLAST
 	LDFLAGS += -lclblast -lOpenCL
 	WHISPER_OBJ += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.c ggml-opencl.h
+
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif
 
extra/sync-ggml.sh CHANGED
@@ -4,7 +4,7 @@ cp -rpv ../ggml/src/ggml.c ./ggml.c
 cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
 cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
-cp -rpv ../ggml/src/ggml-opencl.c ./ggml-opencl.c
+cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
 cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
 cp -rpv ../ggml/examples/common.h ./examples/common.h
 cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
ggml-opencl.c DELETED
@@ -1,361 +0,0 @@
1
- #include "ggml-opencl.h"
2
-
3
- #define CL_TARGET_OPENCL_VERSION 110
4
- #include <clblast_c.h>
5
-
6
- #include <stdlib.h>
7
- #include <stdio.h>
8
- #include <string.h>
9
-
10
- #include "ggml.h"
11
-
12
- #define MULTILINE_QUOTE(...) #__VA_ARGS__
13
- const char * clblast_dequant = MULTILINE_QUOTE(
14
-
15
- typedef uchar uint8_t;
16
- typedef int int32_t;
17
- typedef uint uint32_t;
18
-
19
- constant uint QK4_0 = 32;
20
- struct block_q4_0
21
- {
22
- float d;
23
- uint8_t qs[QK4_0 / 2];
24
- };
25
-
26
- constant uint QK4_1 = 32;
27
- struct block_q4_1
28
- {
29
- float d;
30
- float m;
31
- uint8_t qs[QK4_1 / 2];
32
- };
33
-
34
- constant uint QK5_0 = 32;
35
- struct __attribute__ ((packed)) block_q5_0
36
- {
37
- half d;
38
- uint32_t qh;
39
- uint8_t qs[QK5_0 / 2];
40
- };
41
-
42
- constant uint QK5_1 = 32;
43
- struct block_q5_1
44
- {
45
- half d;
46
- half m;
47
- uint32_t qh;
48
- uint8_t qs[QK5_1 / 2];
49
- };
50
-
51
- constant uint QK8_0 = 32;
52
- struct block_q8_0
53
- {
54
- float d;
55
- uint8_t qs[QK8_0];
56
- };
57
-
58
-
59
- __kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
60
- constant uint qk = QK4_0;
61
-
62
- const uint i = get_global_id(0) / qk;
63
- const uint j = get_local_id(0);
64
-
65
- const float d = x[i].d;
66
-
67
- const int x0 = (x[i].qs[j] & 0xf) - 8;
68
- const int x1 = (x[i].qs[j] >> 4) - 8;
69
-
70
- y[i*qk + j + 0 ] = x0*d;
71
- y[i*qk + j + qk/2] = x1*d;
72
- }
73
-
74
- __kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
75
- constant uint qk = QK4_1;
76
-
77
- const uint i = get_global_id(0) / qk;
78
- const uint j = get_local_id(0);
79
-
80
- const float d = x[i].d;
81
- const float m = x[i].m;
82
-
83
- const int x0 = (x[i].qs[j] & 0xf);
84
- const int x1 = (x[i].qs[j] >> 4);
85
-
86
- y[i*qk + j + 0 ] = x0*d + m;
87
- y[i*qk + j + qk/2] = x1*d + m;
88
- }
89
-
90
- __kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
91
- constant uint qk = QK5_0;
92
-
93
- const uint i = get_global_id(0) / qk;
94
- const uint j = get_local_id(0);
95
-
96
- const float d = vload_half(0, (__global half*) &x[i].d);
97
-
98
- uint32_t qh = x[i].qh;
99
-
100
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
101
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
102
-
103
- const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
104
- const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
105
-
106
- y[i*qk + j + 0 ] = x0*d;
107
- y[i*qk + j + qk/2] = x1*d;
108
- }
109
-
110
- __kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
111
- constant uint qk = QK5_1;
112
-
113
- const uint i = get_global_id(0) / qk;
114
- const uint j = get_local_id(0);
115
-
116
- const float d = vload_half(0, (__global half*) &x[i].d);
117
- const float m = vload_half(0, (__global half*) &x[i].m);
118
-
119
- uint32_t qh = x[i].qh;
120
-
121
- const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
122
- const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
123
-
124
- const int x0 = (x[i].qs[j] & 0xf) | xh_0;
125
- const int x1 = (x[i].qs[j] >> 4) | xh_1;
126
-
127
- y[i*qk + j + 0 ] = x0*d + m;
128
- y[i*qk + j + qk/2] = x1*d + m;
129
- }
130
-
131
- __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
132
- constant uint qk = QK8_0;
133
- const uint i = get_global_id(0) / qk;
134
- const uint j = get_local_id(0);
135
-
136
- const float d = x[i].d;
137
- y[i*qk + j] = x[i].qs[j]*d;
138
- }
139
-
140
- );
141
-
142
- #define CL_CHECK(err, name) \
143
- do { \
144
- cl_int err_ = (err); \
145
- if (err_ != CL_SUCCESS) { \
146
- fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
147
- exit(1); \
148
- } \
149
- } while (0)
150
-
151
- static cl_platform_id platform;
152
- static cl_device_id device;
153
- static cl_context context;
154
- static cl_command_queue queue;
155
- static cl_program program;
156
- static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
157
- static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
158
- static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
159
-
160
- static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
161
- cl_program p;
162
- char *program_log;
163
- size_t program_size, log_size;
164
- int err;
165
-
166
- program_size = strlen(program_buffer);
167
-
168
- p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
169
- if(err < 0) {
170
- fprintf(stderr, "OpenCL error creating program");
171
- exit(1);
172
- }
173
-
174
- err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
175
- if(err < 0) {
176
-
177
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
178
- program_log = (char*) malloc(log_size + 1);
179
- program_log[log_size] = '\0';
180
- clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
181
- printf("%s\n", program_log);
182
- free(program_log);
183
- exit(1);
184
- }
185
-
186
- return p;
187
- }
188
-
189
- void ggml_cl_init(void) {
190
- cl_int err = 0;
191
- char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
192
- char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
193
- int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
194
- int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
195
- printf("\nInitializing CLBlast (First Run)...");
196
- printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
197
- cl_uint num_platforms;
198
- clGetPlatformIDs(0, NULL, &num_platforms);
199
- cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
200
- clGetPlatformIDs(num_platforms, platforms, NULL);
201
- platform = platforms[plat_num];
202
- char platform_buffer[1024];
203
- clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
204
- cl_uint num_devices;
205
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
206
- cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
207
- clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
208
- device = devices[dev_num];
209
- char device_buffer[1024];
210
- clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
211
- printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
212
- context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
213
- CL_CHECK(err, "clCreateContext");
214
- queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
215
- CL_CHECK(err, "clCreateCommandQueue");
216
-
217
- free(platforms);
218
- free(devices);
219
-
220
- program = build_program_from_source(context, device, clblast_dequant);
221
-
222
- // Prepare dequantize kernels
223
- kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
224
- CL_CHECK(err, "clCreateKernel");
225
- kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
226
- CL_CHECK(err, "clCreateKernel");
227
- kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
228
- CL_CHECK(err, "clCreateKernel");
229
- kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
230
- CL_CHECK(err, "clCreateKernel");
231
- kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
232
- CL_CHECK(err, "clCreateKernel");
233
- }
234
-
235
- static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
236
- if (req_size <= *cur_size) {
237
- return;
238
- }
239
-
240
- // Reallocate buffer with enough space
241
- if (*cur_size > 0) {
242
- clReleaseMemObject(*buf);
243
- }
244
- cl_int err;
245
- *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
246
- *cur_size = req_size;
247
- CL_CHECK(err, "clCreateBuffer");
248
- }
249
-
250
- void ggml_cl_sgemm_wrapper(
251
- const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
252
- const int m, const int n, const int k,
253
- const float alpha, const void *host_a, const int lda,
254
- const float *host_b, const int ldb, const float beta,
255
- float *host_c, const int ldc, const int btype) {
256
- cl_int err = 0;
257
-
258
- cl_kernel kernel;
259
- size_t global = n * k, local, size_qb;
260
- bool dequant;
261
-
262
- switch (btype) {
263
- case GGML_TYPE_F32:
264
- dequant = false;
265
- break;
266
- case GGML_TYPE_Q4_0:
267
- dequant = true;
268
- kernel = kernel_q4_0;
269
- local = 16;
270
- size_qb = global * (sizeof(float) + local) / 32;
271
- break;
272
- case GGML_TYPE_Q4_1:
273
- dequant = true;
274
- kernel = kernel_q4_1;
275
- local = 16;
276
- size_qb = global * (sizeof(float) * 2 + local) / 32;
277
- break;
278
- case GGML_TYPE_Q5_0:
279
- dequant = true;
280
- kernel = kernel_q5_0;
281
- local = 16;
282
- size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
283
- break;
284
- case GGML_TYPE_Q5_1:
285
- dequant = true;
286
- kernel = kernel_q5_1;
287
- local = 16;
288
- size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
289
- break;
290
- case GGML_TYPE_Q8_0:
291
- dequant = true;
292
- kernel = kernel_q8_0;
293
- local = 32;
294
- size_qb = global * (sizeof(float) + local) / 32;
295
- break;
296
- default:
297
- fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
298
- abort();
299
- }
300
-
301
- const size_t size_a = m * k * sizeof(float);
302
- const size_t size_b = n * k * sizeof(float);
303
- const size_t size_c = m * n * sizeof(float);
304
-
305
- // Prepare buffers
306
- ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
307
- if (dequant) {
308
- ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
309
- }
310
- ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
311
- ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
312
-
313
- cl_event ev_a, ev_qb, ev_b;
314
-
315
- if (dequant) {
316
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
317
- err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
318
- CL_CHECK(err, "clSetKernelArg");
319
- err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
320
- CL_CHECK(err, "clEnqueueWriteBuffer qb");
321
- } else {
322
- err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
323
- CL_CHECK(err, "clEnqueueWriteBuffer b");
324
- }
325
-
326
- err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
327
- CL_CHECK(err, "clEnqueueWriteBuffer a");
328
- if (dequant) {
329
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
330
- CL_CHECK(err, "clEnqueueNDRangeKernel");
331
- clReleaseEvent(ev_qb);
332
- }
333
- clWaitForEvents(1, &ev_a);
334
- clWaitForEvents(1, &ev_b);
335
- clReleaseEvent(ev_a);
336
- clReleaseEvent(ev_b);
337
-
338
- cl_event ev_sgemm;
339
- CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
340
- (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
341
- m, n, k,
342
- alpha,
343
- cl_buffer_a, 0, lda,
344
- cl_buffer_b, 0, ldb,
345
- beta,
346
- cl_buffer_c, 0, ldc,
347
- &queue, &ev_sgemm);
348
-
349
- if (status != CLBlastSuccess) {
350
- fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
351
- abort();
352
- }
353
-
354
- cl_event ev_c;
355
- clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
356
-
357
- // Wait for completion
358
- clWaitForEvents(1, &ev_c);
359
- clReleaseEvent(ev_sgemm);
360
- clReleaseEvent(ev_c);
361
- }
ggml-opencl.cpp ADDED
@@ -0,0 +1,1684 @@
1
+ #include "ggml-opencl.h"
2
+
3
+ #include <array>
4
+ #include <atomic>
5
+ #include <sstream>
6
+ #include <vector>
7
+ #include <limits>
8
+
9
+ #define CL_TARGET_OPENCL_VERSION 110
10
+ #include <clblast.h>
11
+
12
+ #include <stdlib.h>
13
+ #include <stdio.h>
14
+ #include <string.h>
15
+
16
+ #include "ggml.h"
17
+
18
+ #if defined(_MSC_VER)
19
+ #pragma warning(disable: 4244 4267) // possible loss of data
20
+ #endif
21
+
22
+ #define CL_DMMV_BLOCK_SIZE 32
23
+
24
+ #define MULTILINE_QUOTE(...) #__VA_ARGS__
25
+ static std::string program_source = MULTILINE_QUOTE(
26
+
27
+ typedef char int8_t;
28
+ typedef uchar uint8_t;
29
+ typedef int int32_t;
30
+ typedef uint uint32_t;
31
+
32
+ struct __attribute__ ((packed)) block_q4_0
33
+ {
34
+ half d;
35
+ uint8_t qs[QK4_0 / 2];
36
+ };
37
+
38
+ struct __attribute__ ((packed)) block_q4_1
39
+ {
40
+ half d;
41
+ half m;
42
+ uint8_t qs[QK4_1 / 2];
43
+ };
44
+
45
+ struct __attribute__ ((packed)) block_q5_0
46
+ {
47
+ half d;
48
+ uint32_t qh;
49
+ uint8_t qs[QK5_0 / 2];
50
+ };
51
+
52
+ struct __attribute__ ((packed)) block_q5_1
53
+ {
54
+ half d;
55
+ half m;
56
+ uint32_t qh;
57
+ uint8_t qs[QK5_1 / 2];
58
+ };
59
+
60
+ struct __attribute__ ((packed)) block_q8_0
61
+ {
62
+ half d;
63
+ int8_t qs[QK8_0];
64
+ };
65
+
66
+ struct __attribute__((packed)) block_q2_K
67
+ {
68
+ uint8_t scales[16];
69
+ uint8_t qs[64];
70
+ half d;
71
+ half dmin;
72
+ };
73
+
74
+ struct __attribute__((packed)) block_q3_K
75
+ {
76
+ uint8_t hmask[32];
77
+ uint8_t qs[64];
78
+ uint8_t scales[12];
79
+ half d;
80
+ };
81
+
82
+ struct __attribute__((packed)) block_q4_K
83
+ {
84
+ half d;
85
+ half dmin;
86
+ uint8_t scales[12];
87
+ uint8_t qs[128];
88
+ };
89
+
90
+ struct __attribute__((packed)) block_q5_K
91
+ {
92
+ half d;
93
+ half dmin;
94
+ uint8_t scales[12];
95
+ uint8_t qh[32];
96
+ uint8_t qs[128];
97
+ };
98
+
99
+ struct __attribute__((packed)) block_q6_K
100
+ {
101
+ uint8_t ql[128];
102
+ uint8_t qh[64];
103
+ int8_t scales[16];
104
+ half d;
105
+ };
106
+
107
+ __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) {
108
+ const uint i = get_global_id(0);
109
+
110
+ y[i] = vload_half(0, &x[i]);
111
+ }
112
+
113
+ void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) {
114
+ const float d = vload_half(0, &x[ib].d);
115
+
116
+ const uint8_t vui = x[ib].qs[iqs];
117
+
118
+ const int8_t vi0 = vui & 0xF;
119
+ const int8_t vi1 = vui >> 4;
120
+
121
+ *v0 = (vi0 - 8)*d;
122
+ *v1 = (vi1 - 8)*d;
123
+ }
124
+ void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) {
125
+ const float d = vload_half(0, &x[ib].d);
126
+ const float m = vload_half(0, &x[ib].m);
127
+
128
+ const uint8_t vui = x[ib].qs[iqs];
129
+
130
+ const int8_t vi0 = vui & 0xF;
131
+ const int8_t vi1 = vui >> 4;
132
+
133
+ *v0 = vi0*d + m;
134
+ *v1 = vi1*d + m;
135
+ }
136
+ void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) {
137
+ const float d = vload_half(0, &x[ib].d);
138
+
139
+ uint32_t qh = x[ib].qh;
140
+
141
+ const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
142
+ const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
143
+
144
+ const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
145
+ const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
146
+
147
+ *v0 = x0*d;
148
+ *v1 = x1*d;
149
+ }
150
+ void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) {
151
+ const float d = vload_half(0, &x[ib].d);
152
+ const float m = vload_half(0, &x[ib].m);
153
+
154
+ uint32_t qh = x[ib].qh;
155
+
156
+ const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
157
+ const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
158
+
159
+ const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
160
+ const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
161
+
162
+ *v0 = x0*d + m;
163
+ *v1 = x1*d + m;
164
+ }
165
+ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) {
166
+ const float d = vload_half(0, &x[ib].d);
167
+
168
+ const int8_t vi0 = x[ib].qs[iqs + 0];
169
+ const int8_t vi1 = x[ib].qs[iqs + 1];
170
+
171
+ *v0 = vi0*d;
172
+ *v1 = vi1*d;
173
+ }
174
+ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
175
+ *v0 = vload_half(0, &x[ib + 0]);
176
+ *v1 = vload_half(0, &x[ib + 1]);
177
+ }
178
+
179
+ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
180
+ {
181
+ if (j < 4)
182
+ {
183
+ *d = q[j] & 63;
184
+ *m = q[j + 4] & 63;
185
+ }
186
+ else
187
+ {
188
+ *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
189
+ *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4);
190
+ }
191
+ }
192
+
193
+ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
194
+ {
195
+ const int i = get_group_id(0);
196
+ const int tid = get_local_id(0);
197
+ const int n = tid / 32;
198
+ const int l = tid - 32 * n;
199
+ const int is = 8 * n + l / 16;
200
+
201
+ const uint8_t q = x[i].qs[32 * n + l];
202
+ __global float *y = yy + i * 256 + 128 * n;
203
+
204
+ const float dall = vload_half(0, &x[i].d);
205
+ const float dmin = vload_half(0, &x[i].dmin);
206
+
207
+ y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4);
208
+ y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4);
209
+ y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4);
210
+ y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4);
211
+ }
212
+
213
+ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
214
+ {
215
+ int r = get_local_id(0) / 4;
216
+ int i = get_group_id(0);
217
+ int tid = r / 2;
218
+ int is0 = r % 2;
219
+ int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
220
+ int n = tid / 4;
221
+ int j = tid - 4 * n;
222
+
223
+ uint8_t m = 1 << (4 * n + j);
224
+ int is = 8 * n + 2 * j + is0;
225
+ int shift = 2 * j;
226
+
227
+ int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4)
228
+ : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4)
229
+ : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4)
230
+ : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4);
231
+ float d_all = vload_half(0, &x[i].d);
232
+ float dl = d_all * (us - 32);
233
+
234
+ __global float *y = yy + i * 256 + 128 * n + 32 * j;
235
+ const __global uint8_t *q = x[i].qs + 32 * n;
236
+ const __global uint8_t *hm = x[i].hmask;
237
+
238
+ for (int l = l0; l < l0 + 4; ++l)
239
+ y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
240
+ }
241
+
242
+ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
243
+ {
244
+ const int i = get_group_id(0);
245
+ const int tid = get_local_id(0);
246
+ const int il = tid / 8;
247
+ const int ir = tid % 8;
248
+ const int is = 2 * il;
249
+ const int n = 4;
250
+
251
+ __global float *y = yy + i * 256 + 64 * il + n * ir;
252
+
253
+ const float dall = vload_half(0, &x[i].d);
254
+ const float dmin = vload_half(0, &x[i].dmin);
255
+
256
+ __global const uint8_t *q = x[i].qs + 32 * il + n * ir;
257
+
258
+ uint8_t sc, m;
259
+ get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
260
+ float d1 = dall * sc;
261
+ float m1 = dmin * m;
262
+ get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
263
+ float d2 = dall * sc;
264
+ float m2 = dmin * m;
265
+ for (int l = 0; l < n; ++l)
266
+ {
267
+ y[l + 0] = d1 * (q[l] & 0xF) - m1;
268
+ y[l + 32] = d2 * (q[l] >> 4) - m2;
269
+ }
270
+ }
271
+
272
+ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
273
+ {
274
+ const int i = get_group_id(0);
275
+ const int tid = get_local_id(0);
276
+ const int il = tid / 16;
277
+ const int ir = tid % 16;
278
+ const int is = 2 * il;
279
+
280
+ __global float *y = yy + i * 256 + 64 * il + 2 * ir;
281
+
282
+ const float dall = vload_half(0, &x[i].d);
283
+ const float dmin = vload_half(0, &x[i].dmin);
284
+
285
+ __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir;
286
+ __global const uint8_t *qh = x[i].qh + 2 * ir;
287
+
288
+ uint8_t sc, m;
289
+ get_scale_min_k4(is + 0, x[i].scales, &sc, &m);
290
+ const float d1 = dall * sc;
291
+ const float m1 = dmin * m;
292
+ get_scale_min_k4(is + 1, x[i].scales, &sc, &m);
293
+ const float d2 = dall * sc;
294
+ const float m2 = dmin * m;
295
+
296
+ uint8_t hm = 1 << (2 * il);
297
+ y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1;
298
+ y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1;
299
+ hm <<= 1;
300
+ y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2;
301
+ y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2;
302
+ }
303
+
304
+ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
305
+ {
306
+ const int i = get_group_id(0);
307
+ const int tid = get_local_id(0);
308
+ const int ip = tid / 32;
309
+ const int il = tid - 32 * ip;
310
+ const int is = 8 * ip + il / 16;
311
+
312
+ __global float *y = yy + i * 256 + 128 * ip + il;
313
+
314
+ const float d = vload_half(0, &x[i].d);
315
+
316
+ __global const uint8_t *ql = x[i].ql + 64 * ip + il;
317
+ const uint8_t qh = x[i].qh[32 * ip + il];
318
+ __global const int8_t *sc = x[i].scales + is;
319
+
320
+ y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
321
+ y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
322
+ y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
323
+ y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
324
+ }
325
+
326
+
327
+ void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
328
+
329
+ int n = iqs / 128;
330
+ int r = iqs - 128 * n;
331
+ int l = r / 8;
332
+
333
+ __global const float *y = yy + 128 * n + l;
334
+ __global const uint8_t *q = x[ib].qs + 32 * n + l;
335
+ __global const uint8_t *s = x[ib].scales + 8 * n;
336
+
337
+ const float dall = vload_half(0, &x[ib].d);
338
+ const float dmin = vload_half(0, &x[ib].dmin);
339
+
340
+ float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
341
+ + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
342
+ + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
343
+ + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
344
+ + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
345
+ + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
346
+ + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
347
+ + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
348
+
349
+ *result = sum;
350
+ }
351
+
352
+ void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
353
+
354
+ const uint32_t kmask1 = 0x03030303;
355
+ const uint32_t kmask2 = 0x0f0f0f0f;
356
+
357
+ uint32_t aux[3];
358
+ uint32_t utmp[4];
359
+
360
+ int n = iqs/128;
361
+ int r = iqs - 128*n;
362
+ int l = r/8;
363
+
364
+ __global const float * y = yy + 128*n + l;
365
+ __global const uint8_t * q = x[ib].qs + 32*n + l;
366
+ __global const uint8_t * hm = x[ib].hmask + l;
367
+ const int8_t * s = (const int8_t *)utmp + 8*n;
368
+
369
+ aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24;
370
+ aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24;
371
+ aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
372
+
373
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
374
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
375
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
376
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
377
+
378
+ const float dall = vload_half(0, &x[ib].d);
379
+ const uint8_t m = 1 << (4*n);
380
+
381
+ float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
382
+ + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
383
+ + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
384
+ + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
385
+ + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
386
+ + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
387
+ + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
388
+ + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
389
+
390
+ *result = sum * dall;
391
+
392
+ }
393
+
394
+ void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
395
+
396
+ const int j = iqs / 64; // j is in 0...3
397
+ const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
398
+ const int is = 2*j; // is is in 0...6 in steps of 2
399
+
400
+ __global const float * y = yy + 64*j + ir;
401
+ __global const uint8_t * q = x[ib].qs + 32*j + ir;
402
+
403
+ const float dall = vload_half(0, &x[ib].d);
404
+ const float dmin = vload_half(0, &x[ib].dmin);
405
+
406
+ uint8_t sc, m;
407
+ get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
408
+ const float d1 = dall * sc;
409
+ const float m1 = dmin * m;
410
+ get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
411
+ const float d2 = dall * sc;
412
+ const float m2 = dmin * m;
413
+
414
+ float sum = 0;
415
+ for (int k = 0; k < 4; ++k) {
416
+ sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
417
+ sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
418
+ }
419
+
420
+ *result = sum;
421
+ }
422
+
423
+ void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
424
+
425
+ const int j = iqs / 64;
426
+ const int ir = (iqs - 64*j)/2;
427
+ const int is = 2*j;
428
+
429
+ __global const float * y = yy + 64*j + ir;
430
+ __global const uint8_t * ql = x[ib].qs + 32*j + ir;
431
+ __global const uint8_t * qh = x[ib].qh + ir;
432
+
433
+ const float dall = vload_half(0, &x[ib].d);
434
+ const float dmin = vload_half(0, &x[ib].dmin);
435
+
436
+ uint8_t sc, m;
437
+ get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
438
+ const float d1 = dall * sc;
439
+ const float m1 = dmin * m;
440
+ get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
441
+ const float d2 = dall * sc;
442
+ const float m2 = dmin * m;
443
+
444
+ uint8_t hm = 1 << is;
445
+ float sum = 0;
446
+ for (int k = 0; k < 4; ++k) {
447
+ sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
448
+ }
449
+ hm <<= 1;
450
+ for (int k = 0; k < 4; ++k) {
451
+ sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
452
+ }
453
+ *result = sum;
454
+
455
+ }
456
+
457
+ void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
458
+
459
+
460
+ const int ip = iqs / 128; // 0 or 1
461
+ const int il = (iqs - 128*ip)/8; // 0...15
462
+ const int is = 8*ip;
463
+
464
+ __global const float * y = yy + 128*ip + il;
465
+
466
+ const float d = vload_half(0, &x[ib].d);
467
+
468
+ __global const uint8_t * ql = x[ib].ql + 64*ip + il;
469
+ __global const uint8_t * qh = x[ib].qh + 32*ip + il;
470
+ __global const int8_t * sc = x[ib].scales + is;
471
+
472
+ *result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
473
+ + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
474
+ + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
475
+ + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
476
+ + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
477
+ + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
478
+ + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
479
+ + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
480
+
481
+ }
482
+
483
+ );
484
+
485
+
486
+ std::string dequant_template = MULTILINE_QUOTE(
487
+ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
488
+ const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
489
+
490
+ if (i >= get_global_size(0)) {
491
+ return;
492
+ }
493
+
494
+ const uint qk = QUANT_K;
495
+ const uint qr = QUANT_R;
496
+
497
+ const int ib = i/qk; // block index
498
+ const int iqs = (i%qk)/qr; // quant index
499
+ const int iybs = i - i%qk; // y block start index
500
+ const int y_offset = qr == 1 ? 1 : qk/2;
501
+
502
+ // dequantize
503
+ float v0, v1;
504
+ DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
505
+ y[iybs + iqs + 0] = v0;
506
+ y[iybs + iqs + y_offset] = v1;
507
+ }
508
+ );
509
+
510
+ std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
511
+ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
512
+ const int block_size = get_local_size(0);
513
+ const int row = get_group_id(0);
514
+ const int tid = get_local_id(0);
515
+
516
+ const uint qk = QUANT_K;
517
+ const uint qr = QUANT_R;
518
+
519
+ const int y_offset = qr == 1 ? 1 : qk/2;
520
+
521
+ tmp[tid] = 0;
522
+
523
+ for (int i = 0; i < ncols/block_size; i += 2) {
524
+ const int col = i*block_size + 2*tid;
525
+ const int ib = (row*ncols + col)/qk; // block index
526
+ const int iqs = (col%qk)/qr; // quant index
527
+ const int iybs = col - col%qk; // y block start index
528
+
529
+ // dequantize
530
+ float v0, v1;
531
+ DEQUANT_FUNC(x, ib, iqs, &v0, &v1);
532
+
533
+ // matrix multiplication
534
+ tmp[tid] += v0 * y[iybs + iqs + 0];
535
+ tmp[tid] += v1 * y[iybs + iqs + y_offset];
536
+ }
537
+
538
+ // sum up partial sums and write back result
539
+ barrier(CLK_LOCAL_MEM_FENCE);
540
+ for (int s=block_size/2; s>0; s>>=1) {
541
+ if (tid < s) {
542
+ tmp[tid] += tmp[tid + s];
543
+ }
544
+ barrier(CLK_LOCAL_MEM_FENCE);
545
+ }
546
+ if (tid == 0) {
547
+ dst[row] = tmp[0];
548
+ }
549
+ }
550
+ );
551
+
552
+ std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
553
+ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
554
+ const int block_size = get_local_size(0);
555
+ const int row = get_group_id(0);
556
+ const int tid = get_local_id(0);
557
+
558
+ const int iter_stride = 256;
559
+ const int vals_per_iter = iter_stride / block_size;
560
+ const int num_blocks_per_row = ncols / 256;
561
+ const int ib0 = row*num_blocks_per_row;
562
+
563
+ tmp[tid] = 0;
564
+
565
+ for (int i = 0; i < ncols; i += iter_stride) {
566
+ const int col = i + vals_per_iter*tid;
567
+ const int ib = ib0 + col/256; // x block index
568
+ const int iqs = col%256; // x quant index
569
+ const int iybs = col - col%256; // y block start index
570
+
571
+ // dequantize
572
+ float v;
573
+ DOT_KERNEL(x, ib, iqs, y + iybs, &v);
574
+ tmp[tid] += v;
575
+ }
576
+
577
+ // sum up partial sums and write back result
578
+ barrier(CLK_LOCAL_MEM_FENCE);
579
+ for (int s=block_size/2; s>0; s>>=1) {
580
+ if (tid < s) {
581
+ tmp[tid] += tmp[tid + s];
582
+ }
583
+ barrier(CLK_LOCAL_MEM_FENCE);
584
+ }
585
+ if (tid == 0) {
586
+ dst[row] = tmp[0];
587
+ }
588
+ }
589
+ );
590
+
591
+ std::string mul_template = MULTILINE_QUOTE(
592
+ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
593
+ const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
594
+
595
+ if (i >= get_global_size(0)) {
596
+ return;
597
+ }
598
+
599
+ dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky];
600
+ }
601
+ );
602
+
603
+ #define CL_CHECK(err) \
604
+ do { \
605
+ cl_int err_ = (err); \
606
+ if (err_ != CL_SUCCESS) { \
607
+ fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
608
+ #err, err_, __FILE__, __LINE__); \
609
+ exit(1); \
610
+ } \
611
+ } while (0)
612
+
613
+ #define CLBLAST_CHECK(err) \
614
+ do { \
615
+ CLBlastStatusCode err_ = (err); \
616
+ if (err_ != CLBlastSuccess) { \
617
+ fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \
618
+ #err, err_, __FILE__, __LINE__); \
619
+ exit(1); \
620
+ } \
621
+ } while (0)
622
+
623
+ std::array<std::string, 5> dequant_str_keys = {
624
+ "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
625
+ };
626
+
627
+ std::array<std::string, 30> dequant_str_values = {
628
+ "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
629
+ "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
630
+ "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
631
+ "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
632
+ "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
633
+ "convert_row_f16", "half", "1", "1", "convert_f16"
634
+ };
635
+
636
+ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
637
+ "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
638
+ "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
639
+ "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
640
+ "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
641
+ "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
642
+ "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
643
+ };
644
+
645
+ std::array<std::string, 2> mul_str_keys = {
646
+ "KERNEL_NAME", "TYPE"
647
+ };
648
+ std::array<std::string, 2> mul_str_values = {
649
+ "mul_f32", "float"
650
+ };
651
+
652
+ std::array<std::string, 3> dmmv_k_str_keys = {
653
+ "KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
654
+ };
655
+
656
+ std::array<std::string, 15> dmmv_k_str_values = {
657
+ "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
658
+ "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
659
+ "dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
660
+ "dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
661
+ "dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
662
+ };
663
+
664
+ std::string& replace(std::string& s, const std::string& from, const std::string& to) {
665
+ size_t pos = 0;
666
+ while ((pos = s.find(from, pos)) != std::string::npos) {
667
+ s.replace(pos, from.length(), to);
668
+ pos += to.length();
669
+ }
670
+ return s;
671
+ }
672
+
673
+ std::string generate_kernels() {
674
+ std::stringstream src;
675
+ src << program_source << '\n';
676
+ for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
677
+ std::string dequant_kernel = dequant_template;
678
+ std::string dmmv_kernel = dequant_mul_mat_vec_template;
679
+ for (size_t j = 0; j < dequant_str_keys.size(); j++) {
680
+ replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
681
+ replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
682
+ }
683
+ src << dequant_kernel << '\n';
684
+ src << dmmv_kernel << '\n';
685
+ }
686
+ for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) {
687
+ std::string mul_kernel = mul_template;
688
+ for (size_t j = 0; j < mul_str_keys.size(); j++) {
689
+ replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]);
690
+ }
691
+ src << mul_kernel << '\n';
692
+ }
693
+ for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
694
+ std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
695
+ for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
696
+ replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
697
+ }
698
+ src << dmmv_k_kernel << '\n';
699
+ }
700
+
701
+ return src.str();
702
+ }
703
+
704
+ static cl_platform_id platform;
705
+ static cl_device_id device;
706
+ static cl_context context;
707
+ static cl_command_queue queue;
708
+ static cl_program program;
709
+ static cl_kernel convert_row_f16_cl;
710
+ static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
711
+ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
712
+ static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
713
+ static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
714
+ static cl_kernel mul_f32_cl;
715
+ static bool fp16_support;
716
+
717
+ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
718
+ cl_program p;
719
+ char *program_log;
720
+ size_t program_size;
721
+ size_t log_size;
722
+ int err;
723
+
724
+ program_size = strlen(program_buffer);
725
+
726
+ p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
727
+ if(err < 0) {
728
+ fprintf(stderr, "OpenCL error creating program");
729
+ exit(1);
730
+ }
731
+
732
+ const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
733
+ "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1";
734
+
735
+ err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
736
+ if(err < 0) {
737
+
738
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
739
+ program_log = (char*) malloc(log_size + 1);
740
+ program_log[log_size] = '\0';
741
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
742
+ fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log);
743
+ free(program_log);
744
+ exit(1);
745
+ }
746
+
747
+ return p;
748
+ }
749
+
750
+ void ggml_cl_init(void) {
751
+ cl_int err;
752
+
753
+ struct cl_device;
754
+ struct cl_platform {
755
+ cl_platform_id id;
756
+ unsigned number;
757
+ char name[128];
758
+ char vendor[128];
759
+ struct cl_device * devices;
760
+ unsigned n_devices;
761
+ struct cl_device * default_device;
762
+ };
763
+
764
+ struct cl_device {
765
+ struct cl_platform * platform;
766
+ cl_device_id id;
767
+ unsigned number;
768
+ cl_device_type type;
769
+ char name[128];
770
+ };
771
+
772
+ enum { NPLAT = 16, NDEV = 16 };
773
+
774
+ struct cl_platform platforms[NPLAT];
775
+ unsigned n_platforms = 0;
776
+ struct cl_device devices[NDEV];
777
+ unsigned n_devices = 0;
778
+ struct cl_device * default_device = NULL;
779
+
780
+ platform = NULL;
781
+ device = NULL;
782
+
783
+ cl_platform_id platform_ids[NPLAT];
784
+ CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms));
785
+
786
+ for (unsigned i = 0; i < n_platforms; i++) {
787
+ struct cl_platform * p = &platforms[i];
788
+ p->number = i;
789
+ p->id = platform_ids[i];
790
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
791
+ CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
792
+
793
+ cl_device_id device_ids[NDEV];
794
+ cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
795
+ if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
796
+ p->n_devices = 0;
797
+ } else {
798
+ CL_CHECK(clGetDeviceIDsError);
799
+ }
800
+ p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
801
+ p->default_device = NULL;
802
+
803
+ for (unsigned j = 0; j < p->n_devices; j++) {
804
+ struct cl_device * d = &devices[n_devices];
805
+ d->number = n_devices++;
806
+ d->id = device_ids[j];
807
+ d->platform = p;
808
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
809
+ CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
810
+
811
+ if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
812
+ p->default_device = d;
813
+ }
814
+ }
815
+
816
+ if (default_device == NULL && p->default_device != NULL) {
817
+ default_device = p->default_device;
818
+ }
819
+ }
820
+
821
+ if (n_devices == 0) {
822
+ fprintf(stderr, "ggml_opencl: could not find any OpenCL devices.\n");
823
+ exit(1);
824
+ }
825
+
826
+ char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
827
+ char * user_device_string = getenv("GGML_OPENCL_DEVICE");
828
+ int user_platform_number = -1;
829
+ int user_device_number = -1;
830
+
831
+ unsigned n;
832
+ if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
833
+ user_platform_number = (int)n;
834
+ }
835
+ if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
836
+ user_device_number = (int)n;
837
+ }
838
+ if (user_platform_number != -1 && user_device_number != -1) {
839
+ cl_platform* platform = &platforms[user_platform_number];
840
+ if ((unsigned)user_device_number >= platform->n_devices) {
841
+ fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number);
842
+ exit(1);
843
+ }
844
+ default_device = &platform->devices[user_device_number];
845
+ } else {
846
+
847
+ struct cl_device * selected_devices = devices;
848
+ unsigned n_selected_devices = n_devices;
849
+
850
+ if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
851
+ for (unsigned i = 0; i < n_platforms; i++) {
852
+ struct cl_platform * p = &platforms[i];
853
+ if (strstr(p->name, user_platform_string) != NULL ||
854
+ strstr(p->vendor, user_platform_string) != NULL) {
855
+ user_platform_number = (int)i;
856
+ break;
857
+ }
858
+ }
859
+ if (user_platform_number == -1) {
860
+ fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
861
+ exit(1);
862
+ }
863
+ }
864
+ if (user_platform_number != -1) {
865
+ struct cl_platform * p = &platforms[user_platform_number];
866
+ selected_devices = p->devices;
867
+ n_selected_devices = p->n_devices;
868
+ default_device = p->default_device;
869
+ if (n_selected_devices == 0) {
870
+ fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
871
+ exit(1);
872
+ }
873
+ }
874
+
875
+ if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
876
+ for (unsigned i = 0; i < n_selected_devices; i++) {
877
+ struct cl_device * d = &selected_devices[i];
878
+ if (strstr(d->name, user_device_string) != NULL) {
879
+ user_device_number = d->number;
880
+ break;
881
+ }
882
+ }
883
+ if (user_device_number == -1) {
884
+ fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string);
885
+ exit(1);
886
+ }
887
+ }
888
+ if (user_device_number != -1) {
889
+ selected_devices = &devices[user_device_number];
890
+ n_selected_devices = 1;
891
+ default_device = &selected_devices[0];
892
+ }
893
+
894
+ GGML_ASSERT(n_selected_devices > 0);
895
+
896
+ if (default_device == NULL) {
897
+ default_device = &selected_devices[0];
898
+ }
899
+ }
900
+
901
+ fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
902
+ fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name);
903
+ if (default_device->type != CL_DEVICE_TYPE_GPU) {
904
+ fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
905
+ }
906
+
907
+ platform = default_device->platform->id;
908
+ device = default_device->id;
909
+
910
+ size_t ext_str_size;
911
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
912
+ char *ext_buffer = (char *)alloca(ext_str_size + 1);
913
+ clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
914
+ ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
915
+ // Check if ext_buffer contains cl_khr_fp16
916
+ fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
917
+ fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
918
+
919
+ cl_context_properties properties[] = {
920
+ (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
921
+ };
922
+
923
+ CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
924
+
925
+ CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
926
+ (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
927
+ (queue = clCreateCommandQueue(context, device, 0, &err), err)
928
+ )));
929
+
930
+ const std::string kernel_src = generate_kernels();
931
+
932
+ program = build_program_from_source(context, device, kernel_src.c_str());
933
+
934
+ // FP16 to FP32 kernel
935
+ CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err));
936
+
937
+ // Dequantize kernels
938
+ CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err));
939
+ CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err));
940
+ CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err));
941
+ CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err));
942
+ CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
943
+ CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err));
944
+ CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err));
945
+ CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err));
946
+ CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err));
947
+ CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err));
948
+ CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err));
949
+
950
+ // dequant mul mat kernel
951
+ CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err));
952
+ CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err));
953
+ CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err));
954
+ CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err));
955
+ CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err));
956
+ CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err));
957
+ CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err));
958
+ CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err));
959
+ CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err));
960
+ CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err));
961
+ CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err));
962
+
963
+ // mul kernel
964
+ CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
965
+ }
966
+
967
+ static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
968
+ switch (type) {
969
+ case GGML_TYPE_Q4_0:
970
+ return &dequantize_row_q4_0_cl;
971
+ case GGML_TYPE_Q4_1:
972
+ return &dequantize_row_q4_1_cl;
973
+ case GGML_TYPE_Q5_0:
974
+ return &dequantize_row_q5_0_cl;
975
+ case GGML_TYPE_Q5_1:
976
+ return &dequantize_row_q5_1_cl;
977
+ case GGML_TYPE_Q8_0:
978
+ return &dequantize_row_q8_0_cl;
979
+ case GGML_TYPE_Q2_K:
980
+ return &dequantize_block_q2_k_cl;
981
+ case GGML_TYPE_Q3_K:
982
+ return &dequantize_block_q3_k_cl;
983
+ case GGML_TYPE_Q4_K:
984
+ return &dequantize_block_q4_k_cl;
985
+ case GGML_TYPE_Q5_K:
986
+ return &dequantize_block_q5_k_cl;
987
+ case GGML_TYPE_Q6_K:
988
+ return &dequantize_block_q6_k_cl;
989
+ case GGML_TYPE_F16:
990
+ return &convert_row_f16_cl;
991
+ default:
992
+ return nullptr;
993
+ }
994
+ }
995
+
996
+ static size_t ggml_cl_global_denom(ggml_type type) {
997
+ switch (type) {
998
+ case GGML_TYPE_Q4_0:
999
+ case GGML_TYPE_Q4_1:
1000
+ case GGML_TYPE_Q5_0:
1001
+ case GGML_TYPE_Q5_1:
1002
+ case GGML_TYPE_Q8_0:
1003
+ return 1;
1004
+ case GGML_TYPE_Q2_K:
1005
+ case GGML_TYPE_Q3_K:
1006
+ return 4;
1007
+ case GGML_TYPE_Q4_K:
1008
+ return 8;
1009
+ case GGML_TYPE_Q5_K:
1010
+ case GGML_TYPE_Q6_K:
1011
+ return 4;
1012
+ case GGML_TYPE_F16:
1013
+ default:
1014
+ return 1;
1015
+ }
1016
+ }
1017
+
1018
+ static size_t ggml_cl_local_size(ggml_type type) {
1019
+ switch (type) {
1020
+ case GGML_TYPE_Q4_0:
1021
+ case GGML_TYPE_Q4_1:
1022
+ case GGML_TYPE_Q5_0:
1023
+ case GGML_TYPE_Q5_1:
1024
+ case GGML_TYPE_Q8_0:
1025
+ return 0;
1026
+ case GGML_TYPE_Q2_K:
1027
+ case GGML_TYPE_Q3_K:
1028
+ return 64;
1029
+ case GGML_TYPE_Q4_K:
1030
+ return 32;
1031
+ case GGML_TYPE_Q5_K:
1032
+ case GGML_TYPE_Q6_K:
1033
+ return 64;
1034
+ case GGML_TYPE_F16:
1035
+ default:
1036
+ return 0;
1037
+ }
1038
+ }
1039
+
1040
+ static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
1041
+ switch (type) {
1042
+ case GGML_TYPE_Q4_0:
1043
+ return &dequantize_mul_mat_vec_q4_0_cl;
1044
+ case GGML_TYPE_Q4_1:
1045
+ return &dequantize_mul_mat_vec_q4_1_cl;
1046
+ case GGML_TYPE_Q5_0:
1047
+ return &dequantize_mul_mat_vec_q5_0_cl;
1048
+ case GGML_TYPE_Q5_1:
1049
+ return &dequantize_mul_mat_vec_q5_1_cl;
1050
+ case GGML_TYPE_Q8_0:
1051
+ return &dequantize_mul_mat_vec_q8_0_cl;
1052
+ case GGML_TYPE_F16:
1053
+ return &convert_mul_mat_vec_f16_cl;
1054
+ case GGML_TYPE_Q2_K:
1055
+ return &dequantize_mul_mat_vec_q2_K_cl;
1056
+ case GGML_TYPE_Q3_K:
1057
+ return &dequantize_mul_mat_vec_q3_K_cl;
1058
+ case GGML_TYPE_Q4_K:
1059
+ return &dequantize_mul_mat_vec_q4_K_cl;
1060
+ case GGML_TYPE_Q5_K:
1061
+ return &dequantize_mul_mat_vec_q5_K_cl;
1062
+ case GGML_TYPE_Q6_K:
1063
+ return &dequantize_mul_mat_vec_q6_K_cl;
1064
+ default:
1065
+ return nullptr;
1066
+ }
1067
+ }
1068
+
1069
+ // buffer pool for cl
1070
+ #define MAX_CL_BUFFERS 256
1071
+
1072
+ struct scoped_spin_lock {
1073
+ std::atomic_flag& lock;
1074
+ scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
1075
+ while (lock.test_and_set(std::memory_order_acquire)) {
1076
+ ; // spin
1077
+ }
1078
+ }
1079
+ ~scoped_spin_lock() {
1080
+ lock.clear(std::memory_order_release);
1081
+ }
1082
+ scoped_spin_lock(const scoped_spin_lock&) = delete;
1083
+ scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
1084
+ };
1085
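scoped_spin_lock is a small RAII guard over a std::atomic_flag: the constructor spins until it acquires the flag and the destructor clears it, so the pool functions below only need to construct one at the top of their scope. A minimal usage sketch (some_lock is a hypothetical flag standing in for g_cl_pool_lock):

    static std::atomic_flag some_lock = ATOMIC_FLAG_INIT;

    void touch_shared_state() {
        scoped_spin_lock lock(some_lock); // busy-waits until the flag is acquired
        // ... critical section ...
    }                                     // flag cleared here, even on early return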
+
1086
+ struct cl_buffer {
1087
+ cl_mem mem;
1088
+ size_t size = 0;
1089
+ };
1090
+
1091
+ static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
1092
+ static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
1093
+
1094
+ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) {
1095
+ scoped_spin_lock lock(g_cl_pool_lock);
1096
+ cl_int err;
1097
+
1098
+ int best_i = -1;
1099
+ size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs
1100
+ int worst_i = -1;
1101
+ size_t worst_size = 0; //largest unused buffer seen so far
1102
+ for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
1103
+ cl_buffer &b = g_cl_buffer_pool[i];
1104
+ if (b.size > 0 && b.size >= size && b.size < best_size)
1105
+ {
1106
+ best_i = i;
1107
+ best_size = b.size;
1108
+ }
1109
+ if (b.size > 0 && b.size > worst_size)
1110
+ {
1111
+ worst_i = i;
1112
+ worst_size = b.size;
1113
+ }
1114
+ }
1115
+ if(best_i!=-1) //found the smallest buffer that fits our needs
1116
+ {
1117
+ cl_buffer& b = g_cl_buffer_pool[best_i];
1118
+ cl_mem mem = b.mem;
1119
+ *actual_size = b.size;
1120
+ b.size = 0;
1121
+ return mem;
1122
+ }
1123
+ if(worst_i!=-1) //no buffer fits, release the largest one and allocate a fresh buffer below
1124
+ {
1125
+ cl_buffer& b = g_cl_buffer_pool[worst_i];
1126
+ cl_mem mem = b.mem;
1127
+ b.size = 0;
1128
+ clReleaseMemObject(mem);
1129
+ }
1130
+ cl_mem mem;
1131
+ CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
1132
+ *actual_size = size;
1133
+ return mem;
1134
+ }
1135
+
1136
+ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
1137
+ scoped_spin_lock lock(g_cl_pool_lock);
1138
+
1139
+ for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
1140
+ cl_buffer& b = g_cl_buffer_pool[i];
1141
+ if (b.size == 0) {
1142
+ b.mem = mem;
1143
+ b.size = size;
1144
+ return;
1145
+ }
1146
+ }
1147
+ fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n");
1148
+ clReleaseMemObject(mem);
1149
+ }
1150
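The pool recycles device allocations by size: ggml_cl_pool_malloc hands back the smallest free buffer that fits (or creates a new one) and reports its real capacity, and ggml_cl_pool_free returns the buffer to the pool instead of releasing it. A minimal sketch of the intended pairing, matching how the mul/mul_mat functions below use it (n is a hypothetical element count):

    size_t x_size;                                      // actual capacity, may exceed the request
    cl_mem d_X = ggml_cl_pool_malloc(n * sizeof(float), &x_size);

    // ... enqueue work that reads/writes d_X ...

    ggml_cl_pool_free(d_X, x_size);                     // hand the buffer back with its real size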
+
1151
+ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
1152
+ if (tensor->backend != GGML_BACKEND_GPU) {
1153
+ return;
1154
+ }
1155
+
1156
+ cl_mem mem = (cl_mem)tensor->data;
1157
+ clReleaseMemObject(mem);
1158
+ }
1159
+
1160
+ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
1161
+ cl_int err;
1162
+ const uint64_t ne0 = src->ne[0];
1163
+ const uint64_t ne1 = src->ne[1];
1164
+ const uint64_t nb0 = src->nb[0];
1165
+ const uint64_t nb1 = src->nb[1];
1166
+ const uint64_t nb2 = src->nb[2];
1167
+ const uint64_t nb3 = src->nb[3];
1168
+ const enum ggml_type type = src->type;
1169
+ const size_t ts = ggml_type_size(type);
1170
+ const size_t bs = ggml_blck_size(type);
1171
+
1172
+ const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
1173
+ if (nb0 == ts && nb1 == ts*ne0/bs) {
1174
+ err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
1175
+ return err;
1176
+ }
1177
+ if (nb0 == ts) {
1178
+ const size_t buffer_origin[3] = { offset, 0, 0 };
1179
+ const size_t host_origin[3] = { 0, 0, 0 };
1180
+ const size_t region[3] = { ts*ne0/bs, ne1, 1 };
1181
+ err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
1182
+ return err;
1183
+ }
1184
+ for (uint64_t i1 = 0; i1 < ne1; i1++) {
1185
+ // pretend the row is a matrix with cols=1
1186
+ const size_t buffer_origin[3] = { offset, i1, 0 };
1187
+ const size_t host_origin[3] = { 0, 0, 0 };
1188
+ const size_t region[3] = { ts/bs, ne0, 1 };
1189
+ err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
1190
+ if (err != CL_SUCCESS) {
1191
+ break;
1192
+ }
1193
+ }
1194
+ return err;
1195
+ }
1196
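ggml_cl_h2d_tensor_2d uploads one 2-D slice of a host tensor and picks the cheapest path available: a single clEnqueueWriteBuffer when the slice is fully contiguous, clEnqueueWriteBufferRect when only the rows are contiguous, and a per-row rect write otherwise. A minimal calling sketch, assuming src is a host-side tensor and d_X a pooled device buffer:

    cl_event ev;
    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, /*offset=*/0, src, /*i3=*/0, /*i2=*/0, &ev));
    CL_CHECK(clWaitForEvents(1, &ev)); // or pass ev into the wait list of the next kernel
    CL_CHECK(clReleaseEvent(ev));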
+
1197
+ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1198
+ GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
1199
+ const int64_t ne00 = src0->ne[0];
1200
+ const int64_t ne01 = src0->ne[1];
1201
+ const int64_t ne02 = src0->ne[2];
1202
+ const int64_t ne03 = src0->ne[3];
1203
+ const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
1204
+ const int64_t ne10 = src1->ne[0];
1205
+ const int64_t ne11 = src1->ne[1];
1206
+ const int64_t ne12 = src1->ne[2];
1207
+ const int64_t ne13 = src1->ne[3];
1208
+ const int64_t nb10 = src1->nb[0];
1209
+ const int nb2 = dst->nb[2];
1210
+ const int nb3 = dst->nb[3];
1211
+ size_t x_size;
1212
+ size_t d_size;
1213
+
1214
+ cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
1215
+ cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
1216
+ cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
1217
+
1218
+
1219
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
1220
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
1221
+ const int i0 = i03*ne02 + i02;
1222
+
1223
+ cl_event ev;
1224
+
1225
+ // copy src0 to device
1226
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, i0, src0, i03, i02, &ev));
1227
+
1228
+ if (nb10 == sizeof(float)) {
1229
+ // Contiguous, avoid overhead from queueing many kernel runs
1230
+ const int64_t i13 = i03%ne13;
1231
+ const int64_t i12 = i02%ne12;
1232
+ const int i1 = i13*ne12*ne11 + i12*ne11;
1233
+
1234
+ cl_int x_offset = 0;
1235
+ cl_int y_offset = i1*ne10;
1236
+ cl_int d_offset = 0;
1237
+
1238
+ size_t global = ne00 * ne01;
1239
+ cl_int ky = ne10;
1240
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
1241
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
1242
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
1243
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
1244
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
1245
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
1246
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
1247
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
1248
+ } else {
1249
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
1250
+ const int64_t i13 = i03%ne13;
1251
+ const int64_t i12 = i02%ne12;
1252
+ const int64_t i11 = i01%ne11;
1253
+ const int i1 = i13*ne12*ne11 + i12*ne11 + i11;
1254
+
1255
+ cl_int x_offset = i01*ne00;
1256
+ cl_int y_offset = i1*ne10;
1257
+ cl_int d_offset = i01*ne00;
1258
+
1259
+ // compute
1260
+ size_t global = ne00;
1261
+ cl_int ky = ne10;
1262
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X));
1263
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset));
1264
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y));
1265
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset));
1266
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D));
1267
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset));
1268
+ CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky));
1269
+ CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
1270
+ }
1271
+ }
1272
+
1273
+ CL_CHECK(clReleaseEvent(ev));
1274
+ CL_CHECK(clFinish(queue));
1275
+
1276
+ // copy dst to host
1277
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1278
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
1279
+ }
1280
+ }
1281
+ ggml_cl_pool_free(d_X, x_size);
1282
+ ggml_cl_pool_free(d_D, d_size);
1283
+ }
1284
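The slice loop above broadcasts src1 over src0 with modular indexing (i03 % ne13, i02 % ne12, i01 % ne11), the same convention ggml uses on the CPU. A small worked example of that index math, with hypothetical shapes:

    const int64_t ne11 = 32, ne12 = 1, ne13 = 1;    // src1: 32 rows, a single page and batch
    const int64_t i03 = 1, i02 = 5;                 // current src0 slice
    const int64_t i13 = i03 % ne13;                 // 0 -> src1 has only one batch
    const int64_t i12 = i02 % ne12;                 // 0 -> and only one page
    const int64_t i1  = i13*ne12*ne11 + i12*ne11;   // 0 -> the same src1 rows serve every src0 slice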
+
1285
+ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1286
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
1287
+ ggml_cl_mul_f32(src0, src1, dst);
1288
+ }
1289
+
1290
+ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1291
+ const int64_t ne00 = src0->ne[0];
1292
+ const int64_t ne01 = src0->ne[1];
1293
+ const int64_t ne02 = src0->ne[2];
1294
+ const int64_t ne03 = src0->ne[3];
1295
+
1296
+ const int64_t ne10 = src1->ne[0];
1297
+ const int64_t ne11 = src1->ne[1];
1298
+
1299
+ const int nb2 = dst->nb[2];
1300
+ const int nb3 = dst->nb[3];
1301
+
1302
+ const float alpha = 1.0f;
1303
+ const float beta = 0.0f;
1304
+ const int x_ne = ne01 * ne00;
1305
+ const int y_ne = ne11 * ne10;
1306
+ const int d_ne = ne11 * ne01;
1307
+
1308
+ size_t x_size;
1309
+ size_t y_size;
1310
+ size_t d_size;
1311
+ cl_mem d_X;
1312
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1313
+ d_X = (cl_mem) src0->data;
1314
+ } else {
1315
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); // f32 path: full-precision staging buffer for src0
1316
+ }
1317
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1318
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1319
+
1320
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
1321
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
1322
+ // copy data to device
1323
+ if (src0->backend != GGML_BACKEND_GPU) {
1324
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
1325
+ }
1326
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
1327
+
1328
+ CL_CHECK(clFinish(queue));
1329
+
1330
+ // compute
1331
+ cl_event ev_sgemm;
1332
+ clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
1333
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
1334
+ ne01, ne11, ne10,
1335
+ alpha,
1336
+ d_X, 0, ne00,
1337
+ d_Y, 0, ne10,
1338
+ beta,
1339
+ d_D, 0, ne01,
1340
+ &queue, &ev_sgemm);
1341
+
1342
+ if (status != clblast::StatusCode::kSuccess) {
1343
+ GGML_ASSERT(false);
1344
+ }
1345
+
1346
+ // copy dst to host
1347
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1348
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
1349
+ }
1350
+ }
1351
+
1352
+ if (src0->backend != GGML_BACKEND_GPU) {
1353
+ ggml_cl_pool_free(d_X, x_size);
1354
+ }
1355
+ ggml_cl_pool_free(d_Y, y_size);
1356
+ ggml_cl_pool_free(d_D, d_size);
1357
+ }
1358
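The CLBlast call above maps ggml's row-major tensors onto a column-major GEMM by transposing the first operand. A compact restatement of that mapping, kept as a comment because it only spells out what the call already does:

    // clblast::Gemm<cl_float>(kColMajor, Transpose::kYes, Transpose::kNo, m, n, k, ...)
    //   m = ne01 (rows of src0), n = ne11 (rows of src1), k = ne10 = ne00
    //   A = d_X (lda = ne00), B = d_Y (ldb = ne10), C = d_D (ldc = ne01)
    // element-wise this computes
    //   dst[i1][i0] = sum_k src0[i0][k] * src1[i1][k]
    // i.e. dst = src1 · src0^T in row-major terms (the usual ggml mul_mat contract)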
+
1359
+ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
1360
+ GGML_ASSERT(fp16_support);
1361
+
1362
+ const int64_t ne00 = src0->ne[0];
1363
+ const int64_t ne01 = src0->ne[1];
1364
+ const int64_t ne02 = src0->ne[2];
1365
+ const int64_t ne03 = src0->ne[3];
1366
+
1367
+ const int64_t ne10 = src1->ne[0];
1368
+ const int64_t ne11 = src1->ne[1];
1369
+
1370
+ const int nb10 = src1->nb[0];
1371
+ const int nb11 = src1->nb[1];
1372
+ const int nb12 = src1->nb[2];
1373
+ const int nb13 = src1->nb[3];
1374
+
1375
+ const int nb2 = dst->nb[2];
1376
+ const int nb3 = dst->nb[3];
1377
+
1378
+ const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
1379
+ const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
1380
+ const int x_ne = ne01 * ne00;
1381
+ const int y_ne = ne11 * ne10;
1382
+ const int d_ne = ne11 * ne01;
1383
+
1384
+ size_t x_size;
1385
+ size_t y_size;
1386
+ size_t d_size;
1387
+ cl_mem d_X;
1388
+ if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
1389
+ d_X = (cl_mem) src0->data;
1390
+ } else {
1391
+ d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
1392
+ }
1393
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size);
1394
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size);
1395
+
1396
+ bool src1_cont_rows = nb10 == sizeof(float);
1397
+ bool src1_cont_cols = (size_t)nb11 == ne10*sizeof(float); // rows packed back-to-back
1398
+
1399
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
1400
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
1401
+ // copy src0 to device
1402
+ if (src0->backend != GGML_BACKEND_GPU) {
1403
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
1404
+ }
1405
+
1406
+ // convert src1 to fp16
1407
+ // TODO: use multiple threads
1408
+ ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
1409
+ char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
1410
+ if (src1_cont_rows) {
1411
+ if (src1_cont_cols) {
1412
+ ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
1413
+ }
1414
+ else {
1415
+ for (int64_t i01 = 0; i01 < ne11; i01++) {
1416
+ ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
1417
+ }
1418
+ }
1419
+ }
1420
+ else {
1421
+ for (int64_t i01 = 0; i01 < ne11; i01++) {
1422
+ for (int64_t i00 = 0; i00 < ne10; i00++) {
1423
+ // very slow due to no inlining
1424
+ tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
1425
+ }
1426
+ }
1427
+ }
1428
+
1429
+ // copy src1 to device
1430
+ CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
1431
+
1432
+ CL_CHECK(clFinish(queue));
1433
+
1434
+ // compute
1435
+ cl_event ev_sgemm;
1436
+ clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
1437
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
1438
+ ne01, ne11, ne10,
1439
+ alpha,
1440
+ d_X, 0, ne00,
1441
+ d_Y, 0, ne10,
1442
+ beta,
1443
+ d_D, 0, ne01,
1444
+ &queue, &ev_sgemm);
1445
+
1446
+ if (status != clblast::StatusCode::kSuccess) {
1447
+ GGML_ASSERT(false);
1448
+ }
1449
+
1450
+ // copy dst to host, then convert to float
1451
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
1452
+
1453
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1454
+
1455
+ ggml_fp16_to_fp32_row(tmp, d, d_ne);
1456
+ }
1457
+ }
1458
+
1459
+ if (src0->backend != GGML_BACKEND_GPU) {
1460
+ ggml_cl_pool_free(d_X, x_size);
1461
+ }
1462
+ ggml_cl_pool_free(d_Y, y_size);
1463
+ ggml_cl_pool_free(d_D, d_size);
1464
+ }
1465
+
1466
+ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
1467
+ const int64_t ne00 = src0->ne[0];
1468
+ const int64_t ne01 = src0->ne[1];
1469
+ const int64_t ne02 = src0->ne[2];
1470
+ const int64_t ne03 = src0->ne[3];
1471
+
1472
+ const int64_t ne10 = src1->ne[0];
1473
+ const int64_t ne11 = src1->ne[1];
1474
+
1475
+ const int nb2 = dst->nb[2];
1476
+ const int nb3 = dst->nb[3];
1477
+ const ggml_type type = src0->type;
1478
+ const bool mul_mat_vec = ne11 == 1;
1479
+
1480
+ const float alpha = 1.0f;
1481
+ const float beta = 0.0f;
1482
+ const int x_ne = ne01 * ne00;
1483
+ const int y_ne = ne11 * ne10;
1484
+ const int d_ne = ne11 * ne01;
1485
+ const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
1486
+
1487
+ size_t x_size;
1488
+ size_t y_size;
1489
+ size_t d_size;
1490
+ size_t q_size;
1491
+ cl_mem d_X;
1492
+ if (!mul_mat_vec) {
1493
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
1494
+ }
1495
+ cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
1496
+ cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
1497
+ cl_mem d_Q;
1498
+ if (src0->backend == GGML_BACKEND_CPU) {
1499
+ d_Q = ggml_cl_pool_malloc(q_sz, &q_size);
1500
+ }
1501
+
1502
+ cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
1503
+ cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
1504
+ GGML_ASSERT(to_fp32_cl != nullptr);
1505
+
1506
+ const size_t global_denom = ggml_cl_global_denom(type);
1507
+ const size_t local = ggml_cl_local_size(type);
1508
+
1509
+ size_t ev_idx = 0;
1510
+ std::vector<cl_event> events;
1511
+
1512
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
1513
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
1514
+ // copy src0 to device if necessary
1515
+ if (src0->backend == GGML_BACKEND_CPU) {
1516
+ events.emplace_back();
1517
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
1518
+ } else if (src0->backend == GGML_BACKEND_GPU) {
1519
+ d_Q = (cl_mem) src0->data;
1520
+ } else {
1521
+ GGML_ASSERT(false);
1522
+ }
1523
+ if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
1524
+ // copy src1 to device
1525
+ events.emplace_back();
1526
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
1527
+
1528
+ // compute
1529
+ const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
1530
+ const size_t local = CL_DMMV_BLOCK_SIZE;
1531
+ const cl_int ncols = ne00;
1532
+ events.emplace_back();
1533
+ CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
1534
+ CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
1535
+ CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
1536
+ CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
1537
+ CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
1538
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
1539
+ } else { // general dequantization kernel + CLBlast matrix matrix multiplication
1540
+ // convert src0 to fp32 on device
1541
+ const size_t global = x_ne / global_denom;
1542
+ CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
1543
+ CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
1544
+ CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
1545
+
1546
+ // copy src1 to device
1547
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
1548
+
1549
+ events.emplace_back();
1550
+
1551
+ // wait for conversion
1552
+ CL_CHECK(clFinish(queue));
1553
+
1554
+ // compute
1555
+ clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
1556
+ clblast::Transpose::kYes, clblast::Transpose::kNo,
1557
+ ne01, ne11, ne10,
1558
+ alpha,
1559
+ d_X, 0, ne00,
1560
+ d_Y, 0, ne10,
1561
+ beta,
1562
+ d_D, 0, ne01,
1563
+ &queue, events.data() + ev_idx++);
1564
+
1565
+ if (status != clblast::StatusCode::kSuccess) {
1566
+ GGML_ASSERT(false);
1567
+ }
1568
+ }
1569
+
1570
+ // copy dst to host
1571
+ float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
1572
+ CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
1573
+ for (auto *event : events) {
1574
+ clReleaseEvent(event);
1575
+ }
1576
+
1577
+ ev_idx = 0;
1578
+ events.clear();
1579
+ }
1580
+ }
1581
+
1582
+ if (!mul_mat_vec) {
1583
+ ggml_cl_pool_free(d_X, x_size);
1584
+ }
1585
+ ggml_cl_pool_free(d_Y, y_size);
1586
+ ggml_cl_pool_free(d_D, d_size);
1587
+ if (src0->backend == GGML_BACKEND_CPU) {
1588
+ ggml_cl_pool_free(d_Q, q_size);
1589
+ }
1590
+ }
1591
+
1592
+
1593
+ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1594
+ const int64_t ne10 = src1->ne[0];
1595
+
1596
+ const int64_t ne0 = dst->ne[0];
1597
+ const int64_t ne1 = dst->ne[1];
1598
+
1599
+ // TODO: find the optimal values for these
1600
+ if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
1601
+ src1->type == GGML_TYPE_F32 &&
1602
+ dst->type == GGML_TYPE_F32 &&
1603
+ ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
1604
+ return true;
1605
+ }
1606
+
1607
+ return false;
1608
+ }
1609
+
1610
+ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
1611
+ // If device doesn't support FP16
1612
+ if (!fp16_support) {
1613
+ return false;
1614
+ }
1615
+
1616
+ size_t src0_sz = ggml_nbytes(src0);
1617
+ size_t src1_sz = ggml_nbytes(src1);
1618
+
1619
+ // mul_mat_q: src0 is converted to fp32 on device
1620
+ size_t mul_mat_q_transfer = src0_sz + src1_sz;
1621
+
1622
+ // mul_mat_f16: src1 is converted to fp16 on cpu
1623
+ size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
1624
+
1625
+ // choose the smaller one to transfer to the device
1626
+ // TODO: this is not always the best choice due to the overhead of converting to fp16
1627
+ return mul_mat_f16_transfer < mul_mat_q_transfer;
1628
+ }
1629
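A worked example of the transfer-size comparison above, with hypothetical shapes:

    // src0: 4096 x 4096 F16 weights      -> src0_sz = 4096*4096*2 bytes = 32 MiB
    // src1: 4096 x    8 F32 activations  -> src1_sz = 4096*8*4    bytes = 128 KiB
    // mul_mat_q_transfer   = 32 MiB + 128 KiB  (upload src1 as fp32, dequantize src0 on device)
    // mul_mat_f16_transfer = 32 MiB +  64 KiB  (upload src1 converted to fp16 on the CPU)
    // the fp16 path transfers less, so ggml_cl_mul_mat_use_f16 returns true (given fp16_support)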
+
1630
+ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
1631
+ GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
1632
+
1633
+ if (src0->type == GGML_TYPE_F32) {
1634
+ ggml_cl_mul_mat_f32(src0, src1, dst);
1635
+ }
1636
+ else if (src0->type == GGML_TYPE_F16) {
1637
+ if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
1638
+ ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
1639
+ }
1640
+ else {
1641
+ ggml_cl_mul_mat_q_f32(src0, src1, dst);
1642
+ }
1643
+ }
1644
+ else if (ggml_is_quantized(src0->type)) {
1645
+ ggml_cl_mul_mat_q_f32(src0, src1, dst);
1646
+ }
1647
+ else {
1648
+ GGML_ASSERT(false);
1649
+ }
1650
+ }
1651
+
1652
+ size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
1653
+ if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
1654
+ return ggml_nelements(src1) * sizeof(ggml_fp16_t);
1655
+ }
1656
+ return 0;
1657
+ }
1658
+
1659
+ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
1660
+ const int64_t ne0 = tensor->ne[0];
1661
+ const int64_t ne1 = tensor->ne[1];
1662
+ const int64_t ne2 = tensor->ne[2];
1663
+ const int64_t ne3 = tensor->ne[3];
1664
+
1665
+ const ggml_type type = tensor->type;
1666
+ const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
1667
+
1668
+ size_t q_size;
1669
+ cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
1670
+
1671
+ tensor->data = data;
1672
+ // copy tensor to device
1673
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
1674
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
1675
+ int i = i3*ne2 + i2;
1676
+ CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
1677
+ }
1678
+ }
1679
+
1680
+ CL_CHECK(clFinish(queue));
1681
+
1682
+ tensor->data = dst;
1683
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
1684
+ }
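Finally, a minimal sketch of how a weight tensor is offloaded with the function above, assuming its (possibly quantized) data already lives in host memory and the loader has marked it for the GPU backend:

    tensor->backend = GGML_BACKEND_GPU;               // required by the assert at the end of the function
    ggml_cl_transform_tensor(tensor->data, tensor);   // uploads the data; tensor->data now holds a cl_mem
    // ... at model teardown:
    ggml_cl_free_data(tensor);                        // releases the cl_mem created above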