ggerganov committed (unverified)
Commit f50d3b3 · Parent(s): 6eca3b7

ggml : sync ggml (clBLAST + tensor names)

Files changed (7):
  1. extra/sync-ggml.sh +3 -1
  2. ggml-cuda.cu +389 -38
  3. ggml-cuda.h +5 -40
  4. ggml-opencl.c +398 -0
  5. ggml-opencl.h +24 -0
  6. ggml.c +120 -189
  7. ggml.h +19 -1
extra/sync-ggml.sh CHANGED
@@ -1,8 +1,10 @@
 #!/bin/bash

 cp -rpv ../ggml/src/ggml.c ./ggml.c
-cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
+cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
+cp -rpv ../ggml/src/ggml-opencl.c ./ggml-opencl.c
 cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
 cp -rpv ../ggml/examples/common.h ./examples/common.h
 cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
ggml-cuda.cu CHANGED
@@ -1,11 +1,38 @@
+#include <cstddef>
+#include <cstdint>
 #include <stdint.h>
 #include <stdio.h>
-#include <cuda_fp16.h>
 #include <atomic>
-#include "ggml-cuda.h"
 
-typedef uint16_t ggml_fp16_t;
-static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#include "ggml-cuda.h"
+#include "ggml.h"
+
+static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+#define CUDA_CHECK(err)                                                               \
+    do {                                                                              \
+        cudaError_t err_ = (err);                                                     \
+        if (err_ != cudaSuccess) {                                                    \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+                cudaGetErrorString(err_));                                            \
+            exit(1);                                                                  \
+        }                                                                             \
+    } while (0)
+
+#define CUBLAS_CHECK(err)                                                             \
+    do {                                                                              \
+        cublasStatus_t err_ = (err);                                                  \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                          \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
+            exit(1);                                                                  \
+        }                                                                             \
+    } while (0)
+
+typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 
 #define QK4_0 32
 typedef struct {
@@ -24,14 +51,14 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b
 
 #define QK4_2 16
 typedef struct {
-    __half  d;              // delta
+    half    d;              // delta
     uint8_t qs[QK4_2 / 2];  // nibbles / quants
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 
 #define QK5_0 32
 typedef struct {
-    __half  d;              // delta
+    half    d;              // delta
     uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_0 / 2];  // nibbles / quants
 } block_q5_0;
@@ -39,9 +66,9 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 typedef struct {
-    __half   d;             // delta
-    __half   m;             // min
-    uint32_t qh;            // 5-th bit of quants
+    half    d;              // delta
+    half    m;              // min
+    uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
@@ -162,7 +189,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
 
     const uint8_t * pp = x[i].qs;
 
-    const uint32_t qh = x[i].qh;
+    uint32_t qh;
+    memcpy(&qh, x[i].qh, sizeof(qh));
 
     for (int l = 0; l < QK5_1; l += 2) {
         const uint8_t vi = pp[l/2];
@@ -197,37 +225,50 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
     }
 }
 
-void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_0;
     dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_1;
     dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_2;
     dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_0;
     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_1;
     dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK8_0;
     dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
+// TODO: optimize
+static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
+    const half * x = (const half *) vx;
+
+    const int i = blockIdx.x;
+
+    y[i] = __half2float(x[i]);
+}
+
+static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
+    convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
             return dequantize_row_q4_0_cuda;
@@ -241,6 +282,8 @@ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
             return dequantize_row_q5_1_cuda;
         case GGML_TYPE_Q8_0:
             return dequantize_row_q8_0_cuda;
+        case GGML_TYPE_F16:
+            return convert_fp16_to_fp32_cuda;
         default:
             return nullptr;
     }
@@ -271,7 +314,7 @@ struct cuda_buffer {
 static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -290,7 +333,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -305,28 +348,55 @@ void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
-cublasHandle_t g_cublasH = nullptr;
-cudaStream_t g_cudaStream = nullptr;
-cudaStream_t g_cudaStream2 = nullptr;
-cudaEvent_t g_cudaEvent = nullptr;
+#define GGML_CUDA_MAX_STREAMS 8
+#define GGML_CUDA_MAX_EVENTS 64
+static cublasHandle_t g_cublasH = nullptr;
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };
 
 void ggml_init_cublas() {
     if (g_cublasH == nullptr) {
-        // create cublas handle, bind a stream
-        CUBLAS_CHECK(cublasCreate(&g_cublasH));
-        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
-        CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
+        // create streams
+        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking));
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking));
+        }
+        // create events
+        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming));
+        }
 
-        // create additional stream and event for synchronization
-        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
-        CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
+        // create cublas handle
+        CUBLAS_CHECK(cublasCreate(&g_cublasH));
+        CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
 
         // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
     }
 }
 
-cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
+void * ggml_cuda_host_malloc(size_t size) {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
+    return ptr;
+}
+
+void ggml_cuda_host_free(void * ptr) {
+    CUDA_CHECK(cudaFreeHost(ptr));
+}
+
+static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -354,12 +424,293 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
     }
 }
 
-void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
-    return ptr;
+static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+
+    const float alpha = 1.0f;
+    const float beta  = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+
+    size_t x_size, y_size, d_size;
+    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            float * c_X = d_X + i * x_ne;
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
+            // copy data to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, ne00,
+                                c_Y, ne10,
+                        &beta,  c_D, ne01));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
 }
 
-void ggml_cuda_host_free(void * ptr) {
-    CUDA_CHECK(cudaFreeHost(ptr));
+static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+
+    const float alpha = 1.0f;
+    const float beta  = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+
+    size_t x_size, y_size, d_size;
+    half  * d_X = (half  *) ggml_cuda_pool_malloc(n_mm * sizeof(half)  * x_ne, &x_size);
+    half  * d_Y = (half  *) ggml_cuda_pool_malloc(n_mm * sizeof(half)  * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+
+    bool src1_cont_rows = nb10 == sizeof(float);
+    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            half  * c_X = d_X + i * x_ne;
+            half  * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
+            // copy src0 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+
+            // convert src1 to fp16
+            // TODO: use multiple threads
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
+            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            if (src1_cont_rows) {
+                if (src1_cont_cols) {
+                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                }
+                else {
+                    for (int64_t i01 = 0; i01 < ne11; i01++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    }
+                }
+            }
+            else {
+                for (int64_t i01 = 0; i01 < ne11; i01++) {
+                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                        // very slow due to no inlining
+                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                    }
+                }
+            }
+
+            // copy src1 to device
+            CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, CUDA_R_16F, ne00,
+                                c_Y, CUDA_R_16F, ne10,
+                        &beta,  c_D, CUDA_R_32F, ne01,
+                        CUBLAS_COMPUTE_32F_FAST_16F,
+                        CUBLAS_GEMM_DEFAULT));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+}
+
+static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+    const ggml_type type = src0->type;
+
+    const float alpha = 1.0f;
+    const float beta  = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+
+    size_t x_size, y_size, d_size, q_size;
+    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+    char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
+    GGML_ASSERT(to_fp32_cuda != nullptr);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream  = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t  cudaEvent   = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
+
+            float * c_X = d_X + i * x_ne;
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+            char  * c_Q = d_Q + i * q_sz;
+
+            // copy src0 and convert to fp32 on device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
+            CUDA_CHECK(cudaGetLastError());
+            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+            // copy src1 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+            // wait for conversion
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, ne00,
+                                c_Y, ne10,
+                        &beta,  c_D, ne01));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+    ggml_cuda_pool_free(d_Q, q_size);
+}
+
+bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        src1->type == GGML_TYPE_F32 &&
+        dst->type == GGML_TYPE_F32 &&
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+
+        return true;
+    }
+
+    return false;
+}
+
+bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+    size_t src0_sz = ggml_nbytes(src0);
+    size_t src1_sz = ggml_nbytes(src1);
+
+    // mul_mat_q: src0 is converted to fp32 on device
+    size_t mul_mat_q_transfer = src0_sz + src1_sz;
+
+    // mul_mat_f16: src1 is converted to fp16 on cpu
+    size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1);
+
+    // choose the smaller one to transfer to the device
+    // TODO: this is not always the best choice due to the overhead of converting to fp16
+    return mul_mat_f16_transfer < mul_mat_q_transfer;
+}
+
+void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
+    GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));
+
+    if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_mul_mat_f32(src0, src1, dst);
+    }
+    else if (src0->type == GGML_TYPE_F16) {
+        if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
+            ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
+        }
+        else {
+            ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+        }
+    }
+    else if (ggml_is_quantized(src0->type)) {
+        ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+    }
+    else {
+        GGML_ASSERT(false);
+    }
+}
+
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
+        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+    }
+    else {
+        return 0;
+    }
 }
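The f16-versus-dequantize choice in ggml_cuda_mul_mat_use_f16 above is purely a host-to-device transfer-size comparison. The following standalone snippet works that comparison through for one hypothetical shape (an F16 src0 of 4096x4096 multiplied by an F32 src1 of 4096x512); the sizes are made up for illustration and are not part of the diff:

    /* hypothetical shapes, just to make the transfer-size comparison concrete */
    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        const size_t ne00 = 4096, ne01 = 4096;  /* src0: F16, ne00 x ne01 */
        const size_t ne10 = 4096, ne11 = 512;   /* src1: F32, ne10 x ne11 */

        const size_t src0_sz = ne00 * ne01 * 2; /* fp16 = 2 bytes per element */
        const size_t src1_sz = ne10 * ne11 * 4; /* fp32 = 4 bytes per element */

        /* mul_mat_q path: src0 is sent as-is and converted to fp32 on the device */
        const size_t mul_mat_q_transfer   = src0_sz + src1_sz;
        /* mul_mat_f16 path: src1 is converted to fp16 on the CPU before the copy */
        const size_t mul_mat_f16_transfer = src0_sz + ne10 * ne11 * 2;

        printf("q path:   %zu bytes\n", mul_mat_q_transfer);   /* 32 MiB + 8 MiB */
        printf("f16 path: %zu bytes\n", mul_mat_f16_transfer); /* 32 MiB + 4 MiB -> f16 path wins */
        return 0;
    }

With these shapes the f16 path transfers less data, so ggml_cuda_mul_mat_use_f16 would return true and ggml_cuda_mul_mat_get_wsize would request ggml_nelements(src1) * sizeof(ggml_fp16_t) bytes of scratch for the CPU-side conversion.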
ggml-cuda.h CHANGED
@@ -1,54 +1,19 @@
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
 #include "ggml.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define CUDA_CHECK(err)                                                               \
-    do {                                                                              \
-        cudaError_t err_ = (err);                                                     \
-        if (err_ != cudaSuccess) {                                                    \
-            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
-                cudaGetErrorString(err_));                                            \
-            exit(1);                                                                  \
-        }                                                                             \
-    } while (0)
-
-#define CUBLAS_CHECK(err)                                                             \
-    do {                                                                              \
-        cublasStatus_t err_ = (err);                                                  \
-        if (err_ != CUBLAS_STATUS_SUCCESS) {                                          \
-            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
-            exit(1);                                                                  \
-        }                                                                             \
-    } while (0)
+void ggml_init_cublas(void);
 
-extern cublasHandle_t g_cublasH;
-extern cudaStream_t g_cudaStream;
-extern cudaStream_t g_cudaStream2;
-extern cudaEvent_t g_cudaEvent;
+bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
 
-void   ggml_init_cublas(void);
+// TODO: export these with GGML_API
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void   ggml_cuda_pool_free(void * ptr, size_t size);
-
-void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-
-cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
-
-typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);
-
 #ifdef __cplusplus
 }
 #endif
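The new header shrinks the public CUDA surface to an init call plus three mul_mat entry points. As a rough illustration of how a caller is expected to wire them together — a hedged sketch only; the tensors and the dispatch site in ggml.c are assumed, not shown in this diff — the flow is: check ggml_cuda_can_mul_mat, size the scratch buffer with ggml_cuda_mul_mat_get_wsize, then call ggml_cuda_mul_mat with that buffer:

    /* illustrative only: src0/src1/dst are assumed to be valid ggml tensors
       prepared elsewhere; error handling and scratch reuse are omitted */
    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-cuda.h"

    static void mul_mat_on_gpu(const struct ggml_tensor * src0,
                               const struct ggml_tensor * src1,
                               struct ggml_tensor * dst) {
        ggml_init_cublas(); /* initializes handles/streams on first call only */

        if (!ggml_cuda_can_mul_mat(src0, src1, dst)) {
            return; /* fall back to the CPU path */
        }

        /* non-zero only when the f16 path needs room for src1 converted to fp16 */
        const size_t wsize = ggml_cuda_mul_mat_get_wsize(src0, src1, dst);
        void * wdata = wsize > 0 ? malloc(wsize) : NULL;

        ggml_cuda_mul_mat(src0, src1, dst, wdata, wsize);

        free(wdata);
    }

In ggml.c itself the scratch space comes from the graph work buffer (params->wdata) rather than a malloc; the heap allocation here is only to keep the sketch self-contained.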
ggml-opencl.c ADDED
@@ -0,0 +1,398 @@
1
+ #include "ggml-opencl.h"
2
+
3
+ #define CL_TARGET_OPENCL_VERSION 110
4
+ #include <clblast_c.h>
5
+
6
+ #include <stdlib.h>
7
+ #include <stdio.h>
8
+ #include <string.h>
9
+
10
+ #include "ggml.h"
11
+
12
+ #define MULTILINE_QUOTE(...) #__VA_ARGS__
13
+ const char * clblast_dequant = MULTILINE_QUOTE(
14
+
15
+ struct block_q4_0
16
+ {
17
+ float d;
18
+ uchar qs[16];
19
+ };
20
+
21
+ __kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
22
+ const uint i = get_global_id(0) / 32;
23
+ const uint l = get_local_id(0);
24
+
25
+ const float d = blocks[i].d;
26
+
27
+ const uchar vi = blocks[i].qs[l];
28
+
29
+ const uint index = i*32 + l*2;
30
+ result[index + 0] = ((vi & 0xf) - 8)*d;
31
+ result[index + 1] = ((vi >> 4) - 8)*d;
32
+ }
33
+
34
+ struct block_q4_1
35
+ {
36
+ float d;
37
+ float m;
38
+ uchar qs[16];
39
+ };
40
+
41
+ __kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
42
+ const uint i = get_global_id(0) / 32;
43
+ const uint l = get_local_id(0);
44
+
45
+ const float d = blocks[i].d;
46
+ const float m = blocks[i].m;
47
+
48
+ const uchar vi = blocks[i].qs[l];
49
+
50
+ const uint index = i*32 + l*2;
51
+ result[index + 0] = (vi & 0xf) * d + m;
52
+ result[index + 1] = (vi >> 4) * d + m;
53
+ }
54
+
55
+ struct block_q4_2
56
+ {
57
+ ushort d;
58
+ uchar qs[8];
59
+ };
60
+
61
+ __kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
62
+ const uint i = get_global_id(0) / 16;
63
+ const uint l = get_local_id(0);
64
+
65
+ const float d = vload_half(0, (__global half*) &blocks[i].d);
66
+
67
+ const uchar vi = blocks[i].qs[l];
68
+
69
+ const uint index = i*16 + l*2;
70
+ result[index + 0] = ((vi & 0xf) - 8)*d;
71
+ result[index + 1] = ((vi >> 4) - 8)*d;
72
+ }
73
+
74
+
75
+ struct block_q5_0
76
+ {
77
+ float d;
78
+ uint qh;
79
+ uchar qs[16];
80
+ };
81
+
82
+ __kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
83
+ const uint i = get_global_id(0) / 32;
84
+ const uint l = get_local_id(0);
85
+
86
+ const float d = blocks[i].d;
87
+
88
+ const uchar vi = blocks[i].qs[l];
89
+
90
+ const uint l2 = l * 2;
91
+
92
+ const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
93
+ const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
94
+
95
+ const uint index = i*32 + l2;
96
+ result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
97
+ result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
98
+ }
99
+
100
+ struct block_q5_1
101
+ {
102
+ ushort d;
103
+ ushort m;
104
+ uint qh;
105
+ uchar qs[16];
106
+ };
107
+
108
+ __kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
109
+ const uint i = get_global_id(0) / 32;
110
+ const uint l = get_local_id(0);
111
+
112
+ const float d = vload_half(0, (__global half*) &blocks[i].d);
113
+ const float m = vload_half(0, (__global half*) &blocks[i].m);
114
+
115
+ const uchar vi = blocks[i].qs[l];
116
+
117
+ const uint l2 = l * 2;
118
+
119
+ const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
120
+ const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
121
+
122
+ const uint index = i*32 + l2;
123
+ result[index + 0] = ((vi & 0xf) | vh0)*d + m;
124
+ result[index + 1] = ((vi >> 4) | vh1)*d + m;
125
+ }
126
+
127
+ struct block_q8_0
128
+ {
129
+ float d;
130
+ char qs[32];
131
+ };
132
+
133
+ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
134
+ const uint i = get_global_id(0) / 32;
135
+ const uint l = get_local_id(0);
136
+
137
+ result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
138
+ }
139
+
140
+ );
141
+
142
+ #define CL_CHECK(err, name) \
143
+ do { \
144
+ cl_int err_ = (err); \
145
+ if (err_ != CL_SUCCESS) { \
146
+ fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
147
+ exit(1); \
148
+ } \
149
+ } while (0)
150
+
151
+ #define QK5_0 32
152
+ typedef struct {
153
+ ggml_fp16_t d; // delta
154
+ uint8_t qh[4]; // 5-th bit of quants
155
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
156
+ } block_q5_0;
157
+
158
+
159
+ typedef struct {
160
+ float d; // delta
161
+ uint32_t qh; // 5-th bit of quants
162
+ uint8_t qs[QK5_0 / 2]; // nibbles / quants
163
+ } cl_block_q5_0;
164
+
165
+ static cl_platform_id platform;
166
+ static cl_device_id device;
167
+ static cl_context context;
168
+ static cl_command_queue queue;
169
+ static cl_program program;
170
+ static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
171
+ static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
172
+ static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
173
+
174
+ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
175
+ cl_program p;
176
+ char *program_log;
177
+ size_t program_size, log_size;
178
+ int err;
179
+
180
+ program_size = strlen(program_buffer);
181
+
182
+ p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
183
+ if(err < 0) {
184
+ fprintf(stderr, "OpenCL error creating program");
185
+ exit(1);
186
+ }
187
+
188
+ err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
189
+ if(err < 0) {
190
+
191
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
192
+ program_log = (char*) malloc(log_size + 1);
193
+ program_log[log_size] = '\0';
194
+ clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
195
+ printf("%s\n", program_log);
196
+ free(program_log);
197
+ exit(1);
198
+ }
199
+
200
+ return p;
201
+ }
202
+
203
+ void ggml_cl_init(void) {
204
+ cl_int err = 0;
205
+ char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
206
+ char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
207
+ int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
208
+ int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
209
+ printf("\nInitializing CLBlast (First Run)...");
210
+ printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
211
+ cl_uint num_platforms;
212
+ clGetPlatformIDs(0, NULL, &num_platforms);
213
+ cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
214
+ clGetPlatformIDs(num_platforms, platforms, NULL);
215
+ platform = platforms[plat_num];
216
+ char platform_buffer[1024];
217
+ clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
218
+ cl_uint num_devices;
219
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
220
+ cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
221
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
222
+ device = devices[dev_num];
223
+ char device_buffer[1024];
224
+ clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
225
+ printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
226
+ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
227
+ CL_CHECK(err, "clCreateContext");
228
+ queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
229
+ CL_CHECK(err, "clCreateCommandQueue");
230
+
231
+ free(platforms);
232
+ free(devices);
233
+
234
+ program = build_program_from_source(context, device, clblast_dequant);
235
+
236
+ // Prepare dequantize kernels
237
+ kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
238
+ CL_CHECK(err, "clCreateKernel");
239
+ kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
240
+ CL_CHECK(err, "clCreateKernel");
241
+ kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
242
+ CL_CHECK(err, "clCreateKernel");
243
+ kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
244
+ CL_CHECK(err, "clCreateKernel");
245
+ kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
246
+ CL_CHECK(err, "clCreateKernel");
247
+ kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
248
+ CL_CHECK(err, "clCreateKernel");
249
+ }
250
+
251
+ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
252
+ if (req_size <= *cur_size) {
253
+ return;
254
+ }
255
+
256
+ // Reallocate buffer with enough space
257
+ if (*cur_size > 0) {
258
+ clReleaseMemObject(*buf);
259
+ }
260
+ cl_int err;
261
+ *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
262
+ *cur_size = req_size;
263
+ CL_CHECK(err, "clCreateBuffer");
264
+ }
265
+
266
+ void ggml_cl_sgemm_wrapper(
267
+ const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
268
+ const int m, const int n, const int k,
269
+ const float alpha, const void *host_a, const int lda,
270
+ const float *host_b, const int ldb, const float beta,
271
+ float *host_c, const int ldc, const int btype) {
272
+ cl_int err = 0;
273
+
274
+ cl_kernel kernel;
275
+ size_t global = n * k, local, size_qb;
276
+ bool dequant;
277
+ cl_block_q5_0* cl_host_b;
278
+
279
+ switch (btype) {
280
+ case GGML_TYPE_F32:
281
+ dequant = false;
282
+ break;
283
+ case GGML_TYPE_Q4_0:
284
+ dequant = true;
285
+ kernel = kernel_q4_0;
286
+ local = 16;
287
+ size_qb = global * (sizeof(float) + local) / 32;
288
+ break;
289
+ case GGML_TYPE_Q4_1:
290
+ dequant = true;
291
+ kernel = kernel_q4_1;
292
+ local = 16;
293
+ size_qb = global * (sizeof(float) * 2 + local) / 32;
294
+ break;
295
+ case GGML_TYPE_Q4_2:
296
+ dequant = true;
297
+ kernel = kernel_q4_2;
298
+ local = 8;
299
+ size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
300
+ break;
301
+ case GGML_TYPE_Q5_0:
302
+ dequant = true;
303
+ kernel = kernel_q5_0;
304
+ local = 16;
305
+ // For some reason OpenCL seems to be incapable of working with structs of size 22.
306
+ // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
307
+ // TODO Find the reason, fix and remove workaround.
308
+ const block_q5_0* b = (const block_q5_0*) host_b;
309
+ cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
310
+ for (size_t i = 0; i < global / 32; i++) {
311
+ cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
312
+ memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
313
+ memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
314
+ }
315
+ host_b = (const float*) cl_host_b;
316
+ size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
317
+ break;
318
+ case GGML_TYPE_Q5_1:
319
+ dequant = true;
320
+ kernel = kernel_q5_1;
321
+ local = 16;
322
+ size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
323
+ break;
324
+ case GGML_TYPE_Q8_0:
325
+ dequant = true;
326
+ kernel = kernel_q8_0;
327
+ local = 32;
328
+ size_qb = global * (sizeof(float) + local) / 32;
329
+ break;
330
+ default:
331
+ fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
332
+ abort();
333
+ }
334
+
335
+ const size_t size_a = m * k * sizeof(float);
336
+ const size_t size_b = n * k * sizeof(float);
337
+ const size_t size_c = m * n * sizeof(float);
338
+
339
+ // Prepare buffers
340
+ ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
341
+ if (dequant) {
342
+ ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
343
+ }
344
+ ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
345
+ ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
346
+
347
+ cl_event ev_a, ev_qb, ev_b;
348
+
349
+ if (dequant) {
350
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
351
+ err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
352
+ CL_CHECK(err, "clSetKernelArg");
353
+ err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
354
+ CL_CHECK(err, "clEnqueueWriteBuffer qb");
355
+ } else {
356
+ err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
357
+ CL_CHECK(err, "clEnqueueWriteBuffer b");
358
+ }
359
+
360
+ err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
361
+ CL_CHECK(err, "clEnqueueWriteBuffer a");
362
+ if (dequant) {
363
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
364
+ CL_CHECK(err, "clEnqueueNDRangeKernel");
365
+ clReleaseEvent(ev_qb);
366
+ }
367
+ clWaitForEvents(1, &ev_a);
368
+ clWaitForEvents(1, &ev_b);
369
+ clReleaseEvent(ev_a);
370
+ clReleaseEvent(ev_b);
371
+
372
+ cl_event ev_sgemm;
373
+ CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
374
+ (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
375
+ m, n, k,
376
+ alpha,
377
+ cl_buffer_a, 0, lda,
378
+ cl_buffer_b, 0, ldb,
379
+ beta,
380
+ cl_buffer_c, 0, ldc,
381
+ &queue, &ev_sgemm);
382
+
383
+ if (status != CLBlastSuccess) {
384
+ fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
385
+ abort();
386
+ }
387
+
388
+ cl_event ev_c;
389
+ clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
390
+
391
+ // Wait for completion
392
+ clWaitForEvents(1, &ev_c);
393
+ clReleaseEvent(ev_sgemm);
394
+ clReleaseEvent(ev_c);
395
+ if (btype == GGML_TYPE_Q5_0) {
396
+ free((void*) cl_host_b);
397
+ }
398
+ }
ggml-opencl.h ADDED
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_cl_init(void);
+
+enum ggml_blas_order {
+    GGML_BLAS_ORDER_ROW_MAJOR = 101,
+    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
+};
+
+enum ggml_blas_op {
+    GGML_BLAS_OP_N = 111,
+    GGML_BLAS_OP_T = 112,
+    GGML_BLAS_OP_C = 113,
+};
+
+void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+
+#ifdef __cplusplus
+}
+#endif
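For the unquantized case the wrapper behaves like a plain SGEMM over host pointers. A minimal sketch of a row-major C = A * B^T call in the same "zT = y * xT" shape that the ggml.c call sites use — the dimensions here are made up for illustration and btype is GGML_TYPE_F32, so no dequantization kernel runs:

    /* illustrative sketch: 4x3 result from a 4x8 A and a 3x8 B (row-major), C = A * B^T */
    #include "ggml.h"
    #include "ggml-opencl.h"

    static void example_sgemm(const float * a, const float * b, float * c) {
        const int m = 4, n = 3, k = 8;

        ggml_cl_init(); /* must run once before any sgemm call */

        ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR,
                              GGML_BLAS_OP_N, GGML_BLAS_OP_T,
                              m, n, k,
                              1.0f, a, k,     /* A is m x k, lda = k            */
                                    b, k,     /* B is n x k, transposed, ldb = k */
                              0.0f, c, n,     /* C is m x n, ldc = n            */
                              GGML_TYPE_F32); /* btype: B is plain fp32          */
    }

Device selection happens inside ggml_cl_init, which reads the GGML_CLBLAST_PLATFORM and GGML_CLBLAST_DEVICE environment variables (defaulting to platform 0, device 0), as shown in the ggml-opencl.c listing above.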
ggml.c CHANGED
@@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
135
  #define UNUSED(x) (void)(x)
136
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
137
 
138
- #define GGML_ASSERT(x) \
139
- do { \
140
- if (!(x)) { \
141
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
142
- abort(); \
143
- } \
144
- } while (0)
145
-
146
  #if defined(GGML_USE_ACCELERATE)
147
  #include <Accelerate/Accelerate.h>
148
  #elif defined(GGML_USE_OPENBLAS)
@@ -370,6 +362,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
370
  return GGML_FP32_TO_FP16(x);
371
  }
372
 
373
  //
374
  // timing
375
  //
@@ -808,6 +826,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
808
  float max = 0.0f;
809
  float min = 0.0f;
810
 
 
811
  vector float srcv [8];
812
  vector float maxv[8];
813
  vector float minv[8];
@@ -4325,12 +4344,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4325
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4326
  }
4327
 
4328
- // initialize cuBLAS
4329
- #if defined(GGML_USE_CUBLAS)
4330
  ggml_init_cublas();
4331
- #elif defined(GGML_USE_CLBLAST)
4332
  ggml_cl_init();
4333
- #endif
4334
 
4335
  is_first_call = false;
4336
  }
@@ -4411,7 +4429,7 @@ void ggml_free(struct ggml_context * ctx) {
4411
  }
4412
 
4413
  size_t ggml_used_mem(const struct ggml_context * ctx) {
4414
- return ctx->objects_end->offs + ctx->objects_end->size;
4415
  }
4416
 
4417
  size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
@@ -4524,6 +4542,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4524
  /*.perf_cycles =*/ 0,
4525
  /*.perf_time_us =*/ 0,
4526
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
 
4527
  /*.pad =*/ { 0 },
4528
  };
4529
 
@@ -4878,6 +4897,15 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4878
  return (float *)(tensor->data);
4879
  }
4880
 
 
 
 
 
 
 
 
 
 
4881
  struct ggml_tensor * ggml_view_tensor(
4882
  struct ggml_context * ctx,
4883
  const struct ggml_tensor * src) {
@@ -5977,6 +6005,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
5977
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5978
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
5979
  struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
 
5980
 
5981
  result->op = GGML_OP_DIAG_MASK_INF;
5982
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6034,6 +6063,7 @@ struct ggml_tensor * ggml_rope(
6034
  ((int32_t *) b->data)[0] = n_past;
6035
  ((int32_t *) b->data)[1] = n_dims;
6036
  ((int32_t *) b->data)[2] = mode;
 
6037
 
6038
  result->op = GGML_OP_ROPE;
6039
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -8101,7 +8131,7 @@ static void ggml_compute_forward_rms_norm(
8101
 
8102
  // ggml_compute_forward_mul_mat
8103
 
8104
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8105
  // helper function to determine if it is better to use BLAS or not
8106
  // for large matrices, BLAS is faster
8107
  static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8117,12 +8147,9 @@ static bool ggml_compute_forward_mul_mat_use_blas(
8117
  const int64_t ne1 = dst->ne[1];
8118
 
8119
  // TODO: find the optimal values for these
8120
- if (
8121
- #if !defined(GGML_USE_CUBLAS)
8122
- ggml_is_contiguous(src0) &&
8123
  ggml_is_contiguous(src1) &&
8124
- #endif
8125
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
8126
 
8127
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
8128
  return true;
@@ -8130,7 +8157,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
8130
 
8131
  return false;
8132
  }
8133
-
8134
  #endif
8135
 
8136
  static void ggml_compute_forward_mul_mat_f32(
@@ -8146,7 +8172,7 @@ static void ggml_compute_forward_mul_mat_f32(
8146
  const int64_t ne02 = src0->ne[2];
8147
  const int64_t ne03 = src0->ne[3];
8148
 
8149
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8150
  const int64_t ne10 = src1->ne[0];
8151
  #endif
8152
  const int64_t ne11 = src1->ne[1];
@@ -8203,7 +8229,16 @@ static void ggml_compute_forward_mul_mat_f32(
8203
  // nb01 >= nb00 - src0 is not transposed
8204
  // compute by src0 rows
8205
 
8206
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 
 
 
 
 
 
 
 
 
8207
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8208
  if (params->ith != 0) {
8209
  return;
@@ -8217,43 +8252,13 @@ static void ggml_compute_forward_mul_mat_f32(
8217
  return;
8218
  }
8219
 
8220
- #if defined(GGML_USE_CUBLAS)
8221
- const float alpha = 1.0f;
8222
- const float beta = 0.0f;
8223
- const int x_ne = ne01 * ne00;
8224
- const int y_ne = ne11 * ne10;
8225
- const int d_ne = ne11 * ne01;
8226
-
8227
- size_t x_size, y_size, d_size;
8228
- float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8229
- float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8230
- float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8231
- #endif
8232
-
8233
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8234
  for (int64_t i02 = 0; i02 < ne02; i02++) {
8235
- #if !defined(GGML_USE_CUBLAS)
8236
  const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
8237
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
8238
- #endif
8239
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8240
 
8241
- #if defined(GGML_USE_CUBLAS)
8242
- // copy data to device
8243
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
8244
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
8245
-
8246
- // compute
8247
- CUBLAS_CHECK(
8248
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8249
- ne01, ne11, ne10,
8250
- &alpha, d_X, ne00,
8251
- d_Y, ne10,
8252
- &beta, d_D, ne01));
8253
-
8254
- // copy data to host
8255
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8256
- #elif defined(GGML_USE_CLBLAST)
8257
  // zT = y * xT
8258
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8259
  ne11, ne01, ne10,
@@ -8270,12 +8275,6 @@ static void ggml_compute_forward_mul_mat_f32(
8270
  #endif
8271
  }
8272
  }
8273
- #if defined(GGML_USE_CUBLAS)
8274
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8275
- ggml_cuda_pool_free(d_X, x_size);
8276
- ggml_cuda_pool_free(d_Y, y_size);
8277
- ggml_cuda_pool_free(d_D, d_size);
8278
- #endif
8279
  //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8280
 
8281
  return;
@@ -8405,7 +8404,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8405
  // nb01 >= nb00 - src0 is not transposed
8406
  // compute by src0 rows
8407
 
8408
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 
 
 
 
 
 
 
 
 
8409
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8410
  GGML_ASSERT(nb10 == sizeof(float));
8411
 
@@ -8421,37 +8429,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8421
  return;
8422
  }
8423
 
8424
- #if defined(GGML_USE_CUBLAS)
8425
- const float alpha = 1.0f;
8426
- const float beta = 0.0f;
8427
- const int x_ne = ne01 * ne00;
8428
- const int y_ne = ne11 * ne10;
8429
- const int d_ne = ne11 * ne01;
8430
-
8431
- size_t x_size, y_size, d_size;
8432
- ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8433
- ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8434
- float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8435
- #endif
8436
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8437
  for (int64_t i02 = 0; i02 < ne02; i02++) {
8438
- #if defined(GGML_USE_CUBLAS)
8439
- // copy src0 while converting src1
8440
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
8441
-
8442
- // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
8443
- ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
8444
- {
8445
- size_t id = 0;
8446
- for (int64_t i01 = 0; i01 < ne11; ++i01) {
8447
- for (int64_t i00 = 0; i00 < ne10; ++i00) {
8448
- wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
8449
- }
8450
- }
8451
-
8452
- assert(id*sizeof(ggml_fp16_t) <= params->wsize);
8453
- }
8454
- #else
8455
  float * const wdata = params->wdata;
8456
  {
8457
  size_t id = 0;
@@ -8463,28 +8442,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8463
 
8464
  assert(id*sizeof(float) <= params->wsize);
8465
  }
8466
- #endif
8467
 
8468
- #if defined(GGML_USE_CUBLAS)
8469
- const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
8470
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8471
-
8472
- // copy data to device
8473
- CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
8474
-
8475
- // compute
8476
- CUBLAS_CHECK(
8477
- cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8478
- ne01, ne11, ne10,
8479
- &alpha, d_X, CUDA_R_16F, ne00,
8480
- d_Y, CUDA_R_16F, ne10,
8481
- &beta, d_D, CUDA_R_32F, ne01,
8482
- CUBLAS_COMPUTE_32F,
8483
- CUBLAS_GEMM_DEFAULT));
8484
-
8485
- // copy data to host
8486
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8487
- #elif defined(GGML_USE_CLBLAST)
8488
  const float * x = wdata;
8489
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
8490
 
@@ -8513,12 +8472,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8513
  }
8514
  }
8515
 
8516
- #if defined(GGML_USE_CUBLAS)
8517
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8518
- ggml_cuda_pool_free(d_X, x_size);
8519
- ggml_cuda_pool_free(d_Y, y_size);
8520
- ggml_cuda_pool_free(d_D, d_size);
8521
- #endif
8522
  /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
8523
 
8524
  return;
@@ -8671,7 +8624,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
8671
  // nb01 >= nb00 - src0 is not transposed
8672
  // compute by src0 rows
8673
 
8674
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 
 
 
 
 
 
 
 
 
8675
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8676
  if (params->ith != 0) {
8677
  return;
@@ -8685,25 +8647,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
8685
  return;
8686
  }
8687
 
8688
- #if defined(GGML_USE_CUBLAS)
8689
- const float alpha = 1.0f;
8690
- const float beta = 0.0f;
8691
- const int x_ne = ne01 * ne00;
8692
- const int y_ne = ne11 * ne10;
8693
- const int d_ne = ne11 * ne01;
8694
-
8695
- size_t x_size, y_size, d_size, q_size;
8696
- float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8697
- float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8698
- float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8699
- void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
8700
-
8701
- const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
8702
- GGML_ASSERT(dequantize_row_q_cuda != NULL);
8703
- #else
8704
  float * const wdata = params->wdata;
8705
  dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
8706
- #endif
8707
 
8708
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8709
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8711,14 +8656,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
8711
 
8712
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8713
 
8714
- #if defined(GGML_USE_CUBLAS)
8715
- // copy and dequantize on device
8716
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
8717
-
8718
- dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
8719
- CUDA_CHECK(cudaGetLastError());
8720
- CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
8721
- #elif defined(GGML_USE_CLBLAST)
8722
  const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
8723
  #else
8724
  {
@@ -8734,24 +8672,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
8734
  const float * x = wdata;
8735
  #endif
8736
 
8737
- #if defined(GGML_USE_CUBLAS)
8738
- // copy data to device
8739
- CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
8740
-
8741
- // wait for dequantization
8742
- CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
8743
-
8744
- // compute
8745
- CUBLAS_CHECK(
8746
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8747
- ne01, ne11, ne10,
8748
- &alpha, d_X, ne00,
8749
- d_Y, ne10,
8750
- &beta, d_D, ne01));
8751
-
8752
- // copy data to host
8753
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8754
- #elif defined(GGML_USE_CLBLAST)
8755
  // zT = y * xT
8756
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8757
  ne11, ne01, ne10,
@@ -8769,13 +8690,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
8769
  }
8770
  }
8771
 
8772
- #if defined(GGML_USE_CUBLAS)
8773
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8774
- ggml_cuda_pool_free(d_X, x_size);
8775
- ggml_cuda_pool_free(d_Y, y_size);
8776
- ggml_cuda_pool_free(d_D, d_size);
8777
- ggml_cuda_pool_free(d_Q, q_size);
8778
- #endif
8779
  //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8780
 
8781
  return;
@@ -11759,18 +11673,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11759
 
11760
  size_t cur = 0;
11761
 
 
 
 
 
 
 
 
 
11762
  if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
11763
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
11764
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11765
  node->n_tasks = 1; // TODO: this actually is doing nothing
11766
  // the threads are still spinning
11767
- #if defined(GGML_USE_CUBLAS)
11768
- // with cuBLAS, we need memory for the full 3D / 4D data of src1
11769
- cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
11770
- #else
11771
  // here we need memory just for single 2D matrix from src0
11772
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
11773
- #endif
11774
  } else {
11775
  cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
11776
  }
@@ -11779,13 +11696,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11779
  #endif
11780
  } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
11781
  cur = 0;
11782
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
11783
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11784
  node->n_tasks = 1;
11785
  }
11786
  #endif
11787
  } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
11788
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
11789
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11790
  node->n_tasks = 1;
11791
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -12214,10 +12131,16 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
12214
  snprintf(color, sizeof(color), "white");
12215
  }
12216
 
12217
- fprintf(fp, " \"%p\" [ \
12218
- style = filled; fillcolor = %s; shape = record; \
12219
- label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12220
- (void *) node, color,
12221
  i, node->ne[0], node->ne[1],
12222
  GGML_OP_SYMBOL[node->op]);
12223
 
@@ -12233,18 +12156,26 @@ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12233
 
12234
  snprintf(color, sizeof(color), "pink");
12235
 
12236
  if (ggml_nelements(node) == 1) {
12237
- fprintf(fp, " \"%p\" [ \
12238
- style = filled; fillcolor = %s; shape = record; \
12239
- label=\"<x>%.1e\"; ]\n",
12240
- (void *) node, color, (double)ggml_get_f32_1d(node, 0));
12241
- } else {
12242
- fprintf(fp, " \"%p\" [ \
12243
- style = filled; fillcolor = %s; shape = record; \
12244
- label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
12245
- (void *) node, color,
12246
- i, node->ne[0], node->ne[1]);
12247
  }
 
12248
  }
12249
 
12250
  for (int i = 0; i < gb->n_nodes; i++) {
 
135
  #define UNUSED(x) (void)(x)
136
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
137
 
138
  #if defined(GGML_USE_ACCELERATE)
139
  #include <Accelerate/Accelerate.h>
140
  #elif defined(GGML_USE_OPENBLAS)
 
362
  return GGML_FP32_TO_FP16(x);
363
  }
364
 
365
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
366
+ for (size_t i = 0; i < n; i++) {
367
+ y[i] = GGML_FP16_TO_FP32(x[i]);
368
+ }
369
+ }
370
+
371
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
372
+ size_t i = 0;
373
+ #if defined(__F16C__)
374
+ for (; i + 7 < n; i += 8) {
375
+ __m256 x_vec = _mm256_loadu_ps(x + i);
376
+ __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
377
+ _mm_storeu_si128((__m128i *)(y + i), y_vec);
378
+ }
379
+ for(; i + 3 < n; i += 4) {
380
+ __m128 x_vec = _mm_loadu_ps(x + i);
381
+ __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
382
+ _mm_storel_epi64((__m128i *)(y + i), y_vec);
383
+ }
384
+ #endif
385
+ for (; i < n; i++) {
386
+ y[i] = GGML_FP32_TO_FP16(x[i]);
387
+ }
388
+ }
389
+
390
+
391
  //
392
  // timing
393
  //
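
For reference, a minimal round-trip sketch using the new row converters above (buffer size and values are purely illustrative; the F16C fast path is used automatically when __F16C__ is defined, and half precision rounds larger values):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        float       src[4]  = { 0.5f, -1.25f, 3.0f, 100.0f };
        ggml_fp16_t half[4];
        float       back[4];

        ggml_fp32_to_fp16_row(src,  half, 4); // f32 -> f16
        ggml_fp16_to_fp32_row(half, back, 4); // f16 -> f32

        for (int i = 0; i < 4; i++) {
            printf("%f -> %f\n", src[i], back[i]);
        }
        return 0;
    }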
 
826
  float max = 0.0f;
827
  float min = 0.0f;
828
 
829
+ vector float asrcv [8];
830
  vector float srcv [8];
831
  vector float maxv[8];
832
  vector float minv[8];
 
4344
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4345
  }
4346
 
4347
+ #if defined(GGML_USE_CUBLAS)
 
4348
  ggml_init_cublas();
4349
+ #elif defined(GGML_USE_CLBLAST)
4350
  ggml_cl_init();
4351
+ #endif
4352
 
4353
  is_first_call = false;
4354
  }
 
4429
  }
4430
 
4431
  size_t ggml_used_mem(const struct ggml_context * ctx) {
4432
+ return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
4433
  }
4434
 
4435
  size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
 
4542
  /*.perf_cycles =*/ 0,
4543
  /*.perf_time_us =*/ 0,
4544
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4545
+ /*.name =*/ { 0 },
4546
  /*.pad =*/ { 0 },
4547
  };
4548
 
 
4897
  return (float *)(tensor->data);
4898
  }
4899
 
4900
+ const char * ggml_get_name(const struct ggml_tensor * tensor) {
4901
+ return tensor->name;
4902
+ }
4903
+
4904
+ void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4905
+ strncpy(tensor->name, name, sizeof(tensor->name));
4906
+ tensor->name[sizeof(tensor->name) - 1] = '\0';
4907
+ }
4908
+
4909
  struct ggml_tensor * ggml_view_tensor(
4910
  struct ggml_context * ctx,
4911
  const struct ggml_tensor * src) {
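
A usage sketch for the new naming API (context size and tensor shape are arbitrary; names longer than 31 characters are truncated to fit name[32]):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
        ggml_set_name(w, "encoder.layer0.w");
        printf("tensor name: %s\n", ggml_get_name(w)); // prints "encoder.layer0.w"

        ggml_free(ctx);
        return 0;
    }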
 
6005
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6006
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6007
  struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
6008
+ ggml_set_name(b, "n_past");
6009
 
6010
  result->op = GGML_OP_DIAG_MASK_INF;
6011
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6063
  ((int32_t *) b->data)[0] = n_past;
6064
  ((int32_t *) b->data)[1] = n_dims;
6065
  ((int32_t *) b->data)[2] = mode;
6066
+ ggml_set_name(b, "n_past, n_dims, mode");
6067
 
6068
  result->op = GGML_OP_ROPE;
6069
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
8131
 
8132
  // ggml_compute_forward_mul_mat
8133
 
8134
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8135
  // helper function to determine if it is better to use BLAS or not
8136
  // for large matrices, BLAS is faster
8137
  static bool ggml_compute_forward_mul_mat_use_blas(
 
8147
  const int64_t ne1 = dst->ne[1];
8148
 
8149
  // TODO: find the optimal values for these
8150
+ if (ggml_is_contiguous(src0) &&
8151
  ggml_is_contiguous(src1) &&
8152
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
8153
 
8154
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
8155
  return true;
 
8157
 
8158
  return false;
8159
  }
 
8160
  #endif
8161
 
8162
  static void ggml_compute_forward_mul_mat_f32(
 
8172
  const int64_t ne02 = src0->ne[2];
8173
  const int64_t ne03 = src0->ne[3];
8174
 
8175
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8176
  const int64_t ne10 = src1->ne[0];
8177
  #endif
8178
  const int64_t ne11 = src1->ne[1];
 
8229
  // nb01 >= nb00 - src0 is not transposed
8230
  // compute by src0 rows
8231
 
8232
+ #if defined(GGML_USE_CUBLAS)
8233
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8234
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8235
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8236
+ }
8237
+ return;
8238
+ }
8239
+ #endif
8240
+
8241
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8242
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8243
  if (params->ith != 0) {
8244
  return;
 
8252
  return;
8253
  }
8254
 
8255
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8256
  for (int64_t i02 = 0; i02 < ne02; i02++) {
 
8257
  const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
8258
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
8259
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8260
 
8261
+ #if defined(GGML_USE_CLBLAST)
8262
  // zT = y * xT
8263
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8264
  ne11, ne01, ne10,
 
8275
  #endif
8276
  }
8277
  }
8278
  //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8279
 
8280
  return;
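
For clarity, the ggml_cl_sgemm_wrapper call above ("zT = y * xT") computes, per contiguous 2D slice, dst (ne11 x ne01, row-major) = src1 (ne11 x ne10) times src0 transposed (ne01 x ne00, with ne00 == ne10). A naive reference loop, using mul_mat_ref as a hypothetical helper name and assuming the contiguous f32 layout that ggml_compute_forward_mul_mat_use_blas requires, would be:

    #include <stdint.h>

    // x: src0 slice (ne01 rows x ne00 cols), y: src1 slice (ne11 rows x ne10 cols),
    // d: dst slice (ne11 rows x ne01 cols); all contiguous, row-major f32
    static void mul_mat_ref(const float * x, const float * y, float * d,
                            int64_t ne00, int64_t ne01, int64_t ne10, int64_t ne11) {
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            for (int64_t i01 = 0; i01 < ne01; i01++) {
                float sum = 0.0f;
                for (int64_t k = 0; k < ne10; k++) {
                    sum += y[i11*ne10 + k] * x[i01*ne00 + k];
                }
                d[i11*ne01 + i01] = sum;
            }
        }
    }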
 
8404
  // nb01 >= nb00 - src0 is not transposed
8405
  // compute by src0 rows
8406
 
8407
+ #if defined(GGML_USE_CUBLAS)
8408
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8409
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8410
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8411
+ }
8412
+ return;
8413
+ }
8414
+ #endif
8415
+
8416
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8417
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8418
  GGML_ASSERT(nb10 == sizeof(float));
8419
 
 
8429
  return;
8430
  }
8431
 
8432
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8433
  for (int64_t i02 = 0; i02 < ne02; i02++) {
8434
  float * const wdata = params->wdata;
8435
  {
8436
  size_t id = 0;
 
8442
 
8443
  assert(id*sizeof(float) <= params->wsize);
8444
  }
 
8445
 
8446
+ #if defined(GGML_USE_CLBLAST)
8447
  const float * x = wdata;
8448
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
8449
 
 
8472
  }
8473
  }
8474
 
8475
  /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
8476
 
8477
  return;
 
8624
  // nb01 >= nb00 - src0 is not transposed
8625
  // compute by src0 rows
8626
 
8627
+ #if defined(GGML_USE_CUBLAS)
8628
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8629
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8630
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8631
+ }
8632
+ return;
8633
+ }
8634
+ #endif
8635
+
8636
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8637
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8638
  if (params->ith != 0) {
8639
  return;
 
8647
  return;
8648
  }
8649
 
8650
  float * const wdata = params->wdata;
8651
  dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 
8652
 
8653
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8654
  for (int64_t i02 = 0; i02 < ne02; i02++) {
 
8656
 
8657
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8658
 
8659
+ #if defined(GGML_USE_CLBLAST)
8660
  const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
8661
  #else
8662
  {
 
8672
  const float * x = wdata;
8673
  #endif
8674
 
8675
+ #if defined(GGML_USE_CLBLAST)
8676
  // zT = y * xT
8677
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8678
  ne11, ne01, ne10,
 
8690
  }
8691
  }
8692
 
8693
  //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8694
 
8695
  return;
 
11673
 
11674
  size_t cur = 0;
11675
 
11676
+ #if defined(GGML_USE_CUBLAS)
11677
+ if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
11678
+ node->n_tasks = 1; // TODO: this actually is doing nothing
11679
+ // the threads are still spinning
11680
+ cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
11681
+ }
11682
+ else
11683
+ #endif
11684
  if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
11685
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11686
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11687
  node->n_tasks = 1; // TODO: this actually is doing nothing
11688
  // the threads are still spinning
11689
  // here we need memory just for single 2D matrix from src0
11690
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
 
11691
  } else {
11692
  cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
11693
  }
 
11696
  #endif
11697
  } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
11698
  cur = 0;
11699
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11700
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11701
  node->n_tasks = 1;
11702
  }
11703
  #endif
11704
  } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
11705
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11706
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11707
  node->n_tasks = 1;
11708
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
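
A quick sanity check on the quantized-src0 scratch size above (shape purely illustrative): one dequantized f32 slice of a 4096 x 4096 src0 needs

    // sizeof(float) * ne00 * ne01, e.g. ne00 = ne01 = 4096:
    const size_t cur_example = sizeof(float)*4096*4096; // 67108864 bytes = 64 MiB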
 
12131
  snprintf(color, sizeof(color), "white");
12132
  }
12133
 
12134
+ fprintf(fp, " \"%p\" [ "
12135
+ "style = filled; fillcolor = %s; shape = record; "
12136
+ "label=\"",
12137
+ (void *) node, color);
12138
+
12139
+ if (strlen(node->name) > 0) {
12140
+ fprintf(fp, "%s |", node->name);
12141
+ }
12142
+
12143
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12144
  i, node->ne[0], node->ne[1],
12145
  GGML_OP_SYMBOL[node->op]);
12146
 
 
12156
 
12157
  snprintf(color, sizeof(color), "pink");
12158
 
12159
+ fprintf(fp, " \"%p\" [ "
12160
+ "style = filled; fillcolor = %s; shape = record; "
12161
+ "label=\"<x>",
12162
+ (void *) node, color);
12163
+
12164
+ if (strlen(node->name) > 0) {
12165
+ fprintf(fp, "%s | ", node->name);
12166
+ }
12167
  if (ggml_nelements(node) == 1) {
12168
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
12169
+ fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
12170
+ }
12171
+ else {
12172
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
12173
+ }
12174
+ }
12175
+ else {
12176
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
 
12177
  }
12178
+ fprintf(fp, "\"; ]\n");
12179
  }
12180
 
12181
  for (int i = 0; i < gb->n_nodes; i++) {
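
With tensors labelled via ggml_set_name, the dot output above carries those labels. A minimal usage sketch (ctx and cur are assumed to come from earlier graph-building code, and the output file name is arbitrary):

    struct ggml_cgraph gf = ggml_build_forward(cur);
    ggml_graph_compute(ctx, &gf);
    ggml_graph_dump_dot(&gf, NULL, "ggml-graph.dot");
    // render offline, e.g.: dot -Tpng ggml-graph.dot -o ggml-graph.png
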
ggml.h CHANGED
@@ -197,6 +197,14 @@
197
  #define GGML_MAX_OPT 4
198
  #define GGML_DEFAULT_N_THREADS 4
199
 
200
  #ifdef __cplusplus
201
  extern "C" {
202
  #endif
@@ -212,6 +220,9 @@ extern "C" {
212
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
213
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
214
 
215
  struct ggml_object;
216
  struct ggml_context;
217
 
@@ -339,7 +350,10 @@ extern "C" {
339
  int64_t perf_time_us;
340
 
341
  void * data;
342
- char padding[8];
343
  };
344
 
345
  // computation graph
@@ -399,6 +413,7 @@ extern "C" {
399
 
400
  GGML_API bool ggml_is_quantized(enum ggml_type type);
401
 
 
402
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
403
 
404
  // main
@@ -461,6 +476,9 @@ extern "C" {
461
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
462
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
463
 
464
  //
465
  // operations on tensors with backpropagation
466
  //
 
197
  #define GGML_MAX_OPT 4
198
  #define GGML_DEFAULT_N_THREADS 4
199
 
200
+ #define GGML_ASSERT(x) \
201
+ do { \
202
+ if (!(x)) { \
203
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
204
+ abort(); \
205
+ } \
206
+ } while (0)
207
+
208
  #ifdef __cplusplus
209
  extern "C" {
210
  #endif
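
Unlike assert(), this macro stays active in release builds and reports the failing expression with file and line before aborting (it relies on fprintf and abort being available in the including translation unit). A trivial usage sketch, with scale_rows as a made-up example function:

    // on failure prints e.g. "GGML_ASSERT: example.c:12: n % 32 == 0" and aborts
    static void scale_rows(float * x, int n, float s) {
        GGML_ASSERT(x != NULL);
        GGML_ASSERT(n % 32 == 0);
        for (int i = 0; i < n; i++) {
            x[i] *= s;
        }
    }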
 
220
  GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
221
  GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
222
 
223
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
224
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
225
+
226
  struct ggml_object;
227
  struct ggml_context;
228
 
 
350
  int64_t perf_time_us;
351
 
352
  void * data;
353
+
354
+ char name[32];
355
+
356
+ char padding[8]; // TODO: remove and add padding to name?
357
  };
358
 
359
  // computation graph
 
413
 
414
  GGML_API bool ggml_is_quantized(enum ggml_type type);
415
 
416
+ // TODO: temporary until model loading of ggml examples is refactored
417
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
418
 
419
  // main
 
476
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
477
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
478
 
479
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
480
+ GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
481
+
482
  //
483
  // operations on tensors with backpropagation
484
  //