ggerganov committed (unverified)
Commit ede1268
1 Parent(s): dd2c035

ggml : sync latest ggml + llama.cpp updates (quantization)

Files changed (4)
  1. ggml-cuda.cu +365 -0
  2. ggml-cuda.h +54 -0
  3. ggml.c +0 -0
  4. ggml.h +674 -617
ggml-cuda.cu ADDED
@@ -0,0 +1,365 @@
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <cuda_fp16.h>
+ #include <atomic>
+ #include "ggml-cuda.h"
+
+ typedef uint16_t ggml_fp16_t;
+ static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+ #define QK4_0 32
+ typedef struct {
+     float   d;              // delta
+     uint8_t qs[QK4_0 / 2];  // nibbles / quants
+ } block_q4_0;
+ static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+ #define QK4_1 32
+ typedef struct {
+     float   d;              // delta
+     float   m;              // min
+     uint8_t qs[QK4_1 / 2];  // nibbles / quants
+ } block_q4_1;
+ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+ #define QK4_2 16
+ typedef struct {
+     __half  d;              // delta
+     uint8_t qs[QK4_2 / 2];  // nibbles / quants
+ } block_q4_2;
+ static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
+
+ #define QK5_0 32
+ typedef struct {
+     __half  d;              // delta
+     uint8_t qh[4];          // 5-th bit of quants
+     uint8_t qs[QK5_0 / 2];  // nibbles / quants
+ } block_q5_0;
+ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+ #define QK5_1 32
+ typedef struct {
+     __half   d;             // delta
+     __half   m;             // min
+     uint32_t qh;            // 5-th bit of quants
+     uint8_t  qs[QK5_1 / 2]; // nibbles / quants
+ } block_q5_1;
+ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+ #define QK8_0 32
+ typedef struct {
+     float  d;               // delta
+     int8_t qs[QK8_0];       // quants
+ } block_q8_0;
+ static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+ static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+     const block_q4_0 * x = (const block_q4_0 *) vx;
+
+     const int i = blockIdx.x;
+
+     const float d = x[i].d;
+
+     const uint8_t * pp = x[i].qs;
+
+     for (int l = 0; l < QK4_0; l += 2) {
+         const uint8_t vi = pp[l/2];
+
+         const int8_t vi0 = vi & 0xf;
+         const int8_t vi1 = vi >> 4;
+
+         const float v0 = (vi0 - 8)*d;
+         const float v1 = (vi1 - 8)*d;
+
+         y[i*QK4_0 + l + 0] = v0;
+         y[i*QK4_0 + l + 1] = v1;
+     }
+ }
+
+ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+     const block_q4_1 * x = (const block_q4_1 *) vx;
+
+     const int i = blockIdx.x;
+
+     const float d = x[i].d;
+     const float m = x[i].m;
+
+     const uint8_t * pp = x[i].qs;
+
+     for (int l = 0; l < QK4_1; l += 2) {
+         const uint8_t vi = pp[l/2];
+
+         const int8_t vi0 = vi & 0xf;
+         const int8_t vi1 = vi >> 4;
+
+         const float v0 = vi0*d + m;
+         const float v1 = vi1*d + m;
+
+         y[i*QK4_1 + l + 0] = v0;
+         y[i*QK4_1 + l + 1] = v1;
+     }
+ }
+
+ static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
+     const block_q4_2 * x = (const block_q4_2 *) vx;
+
+     const int i = blockIdx.x;
+
+     const float d = x[i].d;
+
+     const uint8_t * pp = x[i].qs;
+
+     for (int l = 0; l < QK4_2; l += 2) {
+         const uint8_t vi = pp[l/2];
+
+         const int8_t vi0 = vi & 0xf;
+         const int8_t vi1 = vi >> 4;
+
+         const float v0 = (vi0 - 8)*d;
+         const float v1 = (vi1 - 8)*d;
+
+         y[i*QK4_2 + l + 0] = v0;
+         y[i*QK4_2 + l + 1] = v1;
+     }
+ }
+
+ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+     const block_q5_0 * x = (const block_q5_0 *) vx;
+
+     const int i = blockIdx.x;
+
+     const float d = x[i].d;
+
+     const uint8_t * pp = x[i].qs;
+
+     uint32_t qh;
+     memcpy(&qh, x[i].qh, sizeof(qh));
+
+     for (int l = 0; l < QK5_0; l += 2) {
+         const uint8_t vi = pp[l/2];
+
+         const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
+         const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+
+         const int8_t vi0 = ((vi & 0xf) | vh0);
+         const int8_t vi1 = ((vi >> 4) | vh1);
+
+         const float v0 = (vi0 - 16)*d;
+         const float v1 = (vi1 - 16)*d;
+
+         y[i*QK5_0 + l + 0] = v0;
+         y[i*QK5_0 + l + 1] = v1;
+     }
+ }
+
+ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+     const block_q5_1 * x = (const block_q5_1 *) vx;
+
+     const int i = blockIdx.x;
+
+     const float d = x[i].d;
+     const float m = x[i].m;
+
+     const uint8_t * pp = x[i].qs;
+
+     const uint32_t qh = x[i].qh;
+
+     for (int l = 0; l < QK5_1; l += 2) {
+         const uint8_t vi = pp[l/2];
+
+         const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
+         const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+
+         const int8_t vi0 = (vi & 0xf) | vh0;
+         const int8_t vi1 = (vi >> 4) | vh1;
+
+         const float v0 = vi0*d + m;
+         const float v1 = vi1*d + m;
+
+         y[i*QK5_1 + l + 0] = v0;
+         y[i*QK5_1 + l + 1] = v1;
+     }
+ }
+
+ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+     const block_q8_0 * x = (const block_q8_0 *) vx;
+
+     const int i = blockIdx.x;
+
+     const float d = x[i].d;
+
+     const int8_t * pp = x[i].qs;
+
+     for (int l = 0; l < QK8_0; l++) {
+         const int8_t vi = pp[l];
+
+         y[i*QK8_0 + l] = vi*d;
+     }
+ }
+
+ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+     const int nb = k / QK4_0;
+     dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
+ }
+
+ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+     const int nb = k / QK4_1;
+     dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
+ }
+
+ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+     const int nb = k / QK4_2;
+     dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
+ }
+
+ void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+     const int nb = k / QK5_0;
+     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
+ }
+
+ void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+     const int nb = k / QK5_1;
+     dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
+ }
+
+ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+     const int nb = k / QK8_0;
+     dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
+ }
+
+ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
+     switch (type) {
+         case GGML_TYPE_Q4_0:
+             return dequantize_row_q4_0_cuda;
+         case GGML_TYPE_Q4_1:
+             return dequantize_row_q4_1_cuda;
+         case GGML_TYPE_Q4_2:
+             return dequantize_row_q4_2_cuda;
+         case GGML_TYPE_Q5_0:
+             return dequantize_row_q5_0_cuda;
+         case GGML_TYPE_Q5_1:
+             return dequantize_row_q5_1_cuda;
+         case GGML_TYPE_Q8_0:
+             return dequantize_row_q8_0_cuda;
+         default:
+             return nullptr;
+     }
+ }
+
+ // buffer pool for cuda
+ #define MAX_CUDA_BUFFERS 16
+
+ struct scoped_spin_lock {
+     std::atomic_flag& lock;
+     scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+         while (lock.test_and_set(std::memory_order_acquire)) {
+             ; // spin
+         }
+     }
+     ~scoped_spin_lock() {
+         lock.clear(std::memory_order_release);
+     }
+     scoped_spin_lock(const scoped_spin_lock&) = delete;
+     scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+ };
+
+ struct cuda_buffer {
+     void * ptr = nullptr;
+     size_t size = 0;
+ };
+
+ static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+ static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+     scoped_spin_lock lock(g_cuda_pool_lock);
+
+     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+         cuda_buffer& b = g_cuda_buffer_pool[i];
+         if (b.size >= size && b.ptr != nullptr) {
+             void * ptr = b.ptr;
+             *actual_size = b.size;
+             b.ptr = nullptr;
+             b.size = 0;
+             return ptr;
+         }
+     }
+     void * ptr;
+     CUDA_CHECK(cudaMalloc((void **) &ptr, size));
+     *actual_size = size;
+     return ptr;
+ }
+
+ void ggml_cuda_pool_free(void * ptr, size_t size) {
+     scoped_spin_lock lock(g_cuda_pool_lock);
+
+     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+         cuda_buffer& b = g_cuda_buffer_pool[i];
+         if (b.ptr == nullptr) {
+             b.ptr = ptr;
+             b.size = size;
+             return;
+         }
+     }
+     fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+     CUDA_CHECK(cudaFree(ptr));
+ }
+
+ cublasHandle_t g_cublasH = nullptr;
+ cudaStream_t g_cudaStream = nullptr;
+ cudaStream_t g_cudaStream2 = nullptr;
+ cudaEvent_t g_cudaEvent = nullptr;
+
+ void ggml_init_cublas() {
+     if (g_cublasH == nullptr) {
+         // create cublas handle, bind a stream
+         CUBLAS_CHECK(cublasCreate(&g_cublasH));
+         CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
+         CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
+
+         // create additional stream and event for synchronization
+         CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
+         CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
+
+         // configure logging to stdout
+         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+     }
+ }
+
+ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
+     const uint64_t ne0 = src->ne[0];
+     const uint64_t ne1 = src->ne[1];
+     const uint64_t nb0 = src->nb[0];
+     const uint64_t nb1 = src->nb[1];
+     const uint64_t nb2 = src->nb[2];
+     const uint64_t nb3 = src->nb[3];
+     const enum ggml_type type = src->type;
+     const size_t ts = ggml_type_size(type);
+     const size_t bs = ggml_blck_size(type);
+
+     const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+     if (nb0 == ts && nb1 == ts*ne0/bs) {
+         return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
+     } else if (nb0 == ts) {
+         return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
+     } else {
+         for (uint64_t i1 = 0; i1 < ne1; i1++) {
+             const void * rx = (const void *) ((const char *) x + i1*nb1);
+             void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
+             // pretend the row is a matrix with cols=1
+             cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
+             if (r != cudaSuccess) return r;
+         }
+         return cudaSuccess;
+     }
+ }
+
+ void * ggml_cuda_host_malloc(size_t size) {
+     void * ptr;
+     CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+     return ptr;
+ }
+
+ void ggml_cuda_host_free(void * ptr) {
+     CUDA_CHECK(cudaFreeHost(ptr));
+ }
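
For orientation, a minimal host-side sketch (not part of the commit) of how these helpers fit together: initialize the cuBLAS/stream state, stage a quantized row in device memory from the buffer pool, launch the dequantization kernel, and copy the floats back. The function name and buffers are hypothetical; it assumes k is a multiple of QK4_0 (32) and that qdata points to valid block_q4_0 data.

#include "ggml-cuda.h"

// hypothetical usage example, not part of ggml
static void example_dequantize_q4_0(const void * qdata, float * out, int k) {
    ggml_init_cublas();                                   // creates g_cudaStream and the cuBLAS handle on first use

    const int    nb     = k / 32;                         // number of quantization blocks (QK4_0 == 32)
    const size_t q_size = nb * (sizeof(float) + 32 / 2);  // == nb * sizeof(block_q4_0)

    size_t q_actual = 0;
    size_t f_actual = 0;
    void  * d_q =           ggml_cuda_pool_malloc(q_size,            &q_actual);
    float * d_f = (float *) ggml_cuda_pool_malloc(k * sizeof(float), &f_actual);

    CUDA_CHECK(cudaMemcpyAsync(d_q, qdata, q_size, cudaMemcpyHostToDevice, g_cudaStream));
    dequantize_row_q4_0_cuda(d_q, d_f, k, g_cudaStream);  // launches one thread block per quantization block
    CUDA_CHECK(cudaMemcpyAsync(out, d_f, k * sizeof(float), cudaMemcpyDeviceToHost, g_cudaStream));
    CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));

    ggml_cuda_pool_free(d_q, q_actual);
    ggml_cuda_pool_free(d_f, f_actual);
}

Allocating out with ggml_cuda_host_malloc() (pinned memory) instead of plain malloc would let the device-to-host copy overlap with other work, which is presumably why those helpers are exported as well.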
ggml-cuda.h ADDED
@@ -0,0 +1,54 @@
+ #include <cublas_v2.h>
+ #include <cuda_runtime.h>
+ #include "ggml.h"
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #define CUDA_CHECK(err)                                                               \
+     do {                                                                              \
+         cudaError_t err_ = (err);                                                     \
+         if (err_ != cudaSuccess) {                                                    \
+             fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
+                 cudaGetErrorString(err_));                                            \
+             exit(1);                                                                  \
+         }                                                                             \
+     } while (0)
+
+ #define CUBLAS_CHECK(err)                                                             \
+     do {                                                                              \
+         cublasStatus_t err_ = (err);                                                  \
+         if (err_ != CUBLAS_STATUS_SUCCESS) {                                          \
+             fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
+             exit(1);                                                                  \
+         }                                                                             \
+     } while (0)
+
+ extern cublasHandle_t g_cublasH;
+ extern cudaStream_t   g_cudaStream;
+ extern cudaStream_t   g_cudaStream2;
+ extern cudaEvent_t    g_cudaEvent;
+
+ void   ggml_init_cublas(void);
+ void * ggml_cuda_host_malloc(size_t size);
+ void   ggml_cuda_host_free(void * ptr);
+
+ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
+ void   ggml_cuda_pool_free(void * ptr, size_t size);
+
+ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
+
+ typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);
+
+ #ifdef __cplusplus
+ }
+ #endif
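
A second sketch (again not part of the commit) of the run-time dispatch this header's function-pointer table enables on the ggml side: look up the dequantizer for a tensor's type and fall back when the type has no CUDA path. The helper name is made up; d_q and d_f are assumed to be device pointers sized for the type.

#include <stdio.h>
#include "ggml-cuda.h"

// hypothetical usage example: dispatch on the quantization type at run time
static bool example_dequantize_any(enum ggml_type type, const void * d_q, float * d_f, int k) {
    const dequantize_row_q_cuda_t to_fp32 = ggml_get_dequantize_row_q_cuda(type);
    if (to_fp32 == NULL) {
        fprintf(stderr, "type %d has no CUDA dequantizer, falling back to the CPU path\n", (int) type);
        return false;
    }
    to_fp32(d_q, d_f, k, g_cudaStream);
    return true;
}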
ggml.c CHANGED
The diff for this file is too large to render. See raw diff
 
ggml.h CHANGED
@@ -169,14 +169,27 @@
169
  //
170
  //
171
 
172
- #ifdef __cplusplus
173
- extern "C" {
174
  #endif
175
 
176
  #include <stdint.h>
177
  #include <stddef.h>
178
  #include <stdbool.h>
179
 
180
  #define GGML_MAX_DIMS 4
181
  #define GGML_MAX_NODES 4096
182
  #define GGML_MAX_PARAMS 16
@@ -184,660 +197,704 @@ extern "C" {
184
  #define GGML_MAX_OPT 4
185
  #define GGML_DEFAULT_N_THREADS 4
186
 
187
  #ifdef __ARM_NEON
188
- // we use the built-in 16-bit float type
189
- typedef __fp16 ggml_fp16_t;
190
  #else
191
- typedef uint16_t ggml_fp16_t;
192
  #endif
193
 
194
- // convert FP16 <-> FP32
195
- float ggml_fp16_to_fp32(ggml_fp16_t x);
196
- ggml_fp16_t ggml_fp32_to_fp16(float x);
197
-
198
- struct ggml_object;
199
- struct ggml_context;
200
-
201
- enum ggml_type {
202
- // explicitly numbered values are used in llama.cpp files
203
- GGML_TYPE_F32 = 0,
204
- GGML_TYPE_F16 = 1,
205
- GGML_TYPE_Q4_0 = 2,
206
- GGML_TYPE_Q4_1 = 3,
207
- GGML_TYPE_I8,
208
- GGML_TYPE_I16,
209
- GGML_TYPE_I32,
210
- GGML_TYPE_COUNT,
211
- };
212
-
213
- // available tensor operations:
214
- enum ggml_op {
215
- GGML_OP_NONE = 0,
216
-
217
- GGML_OP_DUP,
218
- GGML_OP_ADD,
219
- GGML_OP_SUB,
220
- GGML_OP_MUL,
221
- GGML_OP_DIV,
222
- GGML_OP_SQR,
223
- GGML_OP_SQRT,
224
- GGML_OP_SUM,
225
- GGML_OP_MEAN,
226
- GGML_OP_REPEAT,
227
- GGML_OP_ABS,
228
- GGML_OP_SGN,
229
- GGML_OP_NEG,
230
- GGML_OP_STEP,
231
- GGML_OP_RELU,
232
- GGML_OP_GELU,
233
- GGML_OP_SILU,
234
- GGML_OP_NORM, // normalize
235
- GGML_OP_RMS_NORM,
236
-
237
- GGML_OP_MUL_MAT,
238
-
239
- GGML_OP_SCALE,
240
- GGML_OP_CPY,
241
- GGML_OP_CONT,
242
- GGML_OP_RESHAPE,
243
- GGML_OP_VIEW,
244
- GGML_OP_PERMUTE,
245
- GGML_OP_TRANSPOSE,
246
- GGML_OP_GET_ROWS,
247
- GGML_OP_DIAG_MASK_INF,
248
- GGML_OP_SOFT_MAX,
249
- GGML_OP_ROPE,
250
- GGML_OP_CONV_1D_1S,
251
- GGML_OP_CONV_1D_2S,
252
-
253
- GGML_OP_FLASH_ATTN,
254
- GGML_OP_FLASH_FF,
255
-
256
- GGML_OP_MAP_UNARY,
257
- GGML_OP_MAP_BINARY,
258
-
259
- GGML_OP_COUNT,
260
- };
261
-
262
-
263
- // ggml object
264
- struct ggml_object {
265
- size_t offs;
266
- size_t size;
267
-
268
- struct ggml_object * next;
269
-
270
- char padding[8];
271
- };
272
-
273
- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
274
-
275
- // n-dimensional tensor
276
- struct ggml_tensor {
277
- enum ggml_type type;
278
-
279
- int n_dims;
280
- int64_t ne[GGML_MAX_DIMS]; // number of elements
281
- size_t nb[GGML_MAX_DIMS]; // stride in bytes:
282
- // nb[0] = sizeof(type)
283
- // nb[1] = nb[0] * ne[0] + padding
284
- // nb[i] = nb[i-1] * ne[i-1]
285
-
286
- // compute data
287
- enum ggml_op op;
288
-
289
- bool is_param;
290
-
291
- struct ggml_tensor * grad;
292
- struct ggml_tensor * src0;
293
- struct ggml_tensor * src1;
294
- struct ggml_tensor * opt[GGML_MAX_OPT];
295
-
296
- // thread scheduling
297
- int n_tasks;
298
-
299
- // performance
300
- int perf_runs;
301
- int64_t perf_cycles;
302
- int64_t perf_time_us;
303
-
304
- void * data;
305
- char padding[8];
306
- };
307
-
308
- // computation graph
309
- struct ggml_cgraph {
310
- int n_nodes;
311
- int n_leafs;
312
- int n_threads;
313
-
314
- size_t work_size;
315
- struct ggml_tensor * work;
316
-
317
- struct ggml_tensor * nodes[GGML_MAX_NODES];
318
- struct ggml_tensor * grads[GGML_MAX_NODES];
319
- struct ggml_tensor * leafs[GGML_MAX_NODES];
320
-
321
- // performance
322
- int perf_runs;
323
- int64_t perf_cycles;
324
- int64_t perf_time_us;
325
- };
326
-
327
- // scratch buffer
328
- struct ggml_scratch {
329
- size_t offs;
330
- size_t size;
331
- void * data;
332
- };
333
-
334
- struct ggml_init_params {
335
- // memory pool
336
- size_t mem_size; // bytes
337
- void * mem_buffer; // if NULL, memory will be allocated internally
338
- bool no_alloc; // don't allocate memory for the tensor data
339
- };
340
-
341
- void ggml_time_init(void); // call this once at the beginning of the program
342
- int64_t ggml_time_ms(void);
343
- int64_t ggml_time_us(void);
344
- int64_t ggml_cycles(void);
345
- int64_t ggml_cycles_per_ms(void);
346
 
347
- void ggml_print_object (const struct ggml_object * obj);
348
- void ggml_print_objects(const struct ggml_context * ctx);
349
 
350
- int64_t ggml_nelements(const struct ggml_tensor * tensor);
351
- size_t ggml_nbytes (const struct ggml_tensor * tensor);
352
 
353
- int ggml_blck_size (enum ggml_type type);
354
- size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
355
- float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
 
356
 
357
- size_t ggml_element_size(const struct ggml_tensor * tensor);
 
358
 
359
- struct ggml_context * ggml_init(struct ggml_init_params params);
360
- void ggml_free(struct ggml_context * ctx);
361
 
362
- size_t ggml_used_mem(const struct ggml_context * ctx);
363
-
364
- size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
365
-
366
- struct ggml_tensor * ggml_new_tensor(
367
- struct ggml_context * ctx,
368
- enum ggml_type type,
369
- int n_dims,
370
- const int64_t *ne);
371
-
372
- struct ggml_tensor * ggml_new_tensor_1d(
373
- struct ggml_context * ctx,
374
- enum ggml_type type,
375
- int64_t ne0);
376
-
377
- struct ggml_tensor * ggml_new_tensor_2d(
378
- struct ggml_context * ctx,
379
- enum ggml_type type,
380
- int64_t ne0,
381
- int64_t ne1);
382
-
383
- struct ggml_tensor * ggml_new_tensor_3d(
384
- struct ggml_context * ctx,
385
- enum ggml_type type,
386
- int64_t ne0,
387
- int64_t ne1,
388
- int64_t ne2);
389
-
390
- struct ggml_tensor * ggml_new_tensor_4d(
391
- struct ggml_context * ctx,
392
- enum ggml_type type,
393
- int64_t ne0,
394
- int64_t ne1,
395
- int64_t ne2,
396
- int64_t ne3);
397
-
398
- struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
399
- struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
400
-
401
- struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
402
- struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
403
-
404
- struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
405
- struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
406
- struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
407
-
408
- int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
409
- void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
410
-
411
- float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
412
- void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
413
-
414
- void * ggml_get_data (const struct ggml_tensor * tensor);
415
- float * ggml_get_data_f32(const struct ggml_tensor * tensor);
416
-
417
- //
418
- // operations on tensors with backpropagation
419
- //
420
-
421
- struct ggml_tensor * ggml_dup(
422
- struct ggml_context * ctx,
423
- struct ggml_tensor * a);
424
-
425
- struct ggml_tensor * ggml_add(
426
- struct ggml_context * ctx,
427
- struct ggml_tensor * a,
428
- struct ggml_tensor * b);
429
-
430
- struct ggml_tensor * ggml_sub(
431
- struct ggml_context * ctx,
432
- struct ggml_tensor * a,
433
- struct ggml_tensor * b);
434
-
435
- struct ggml_tensor * ggml_mul(
436
- struct ggml_context * ctx,
437
- struct ggml_tensor * a,
438
- struct ggml_tensor * b);
439
-
440
- struct ggml_tensor * ggml_div(
441
- struct ggml_context * ctx,
442
- struct ggml_tensor * a,
443
- struct ggml_tensor * b);
444
-
445
- struct ggml_tensor * ggml_sqr(
446
- struct ggml_context * ctx,
447
- struct ggml_tensor * a);
448
-
449
- struct ggml_tensor * ggml_sqrt(
450
- struct ggml_context * ctx,
451
- struct ggml_tensor * a);
452
-
453
- // return scalar
454
- // TODO: compute sum along rows
455
- struct ggml_tensor * ggml_sum(
456
- struct ggml_context * ctx,
457
- struct ggml_tensor * a);
458
-
459
- // mean along rows
460
- struct ggml_tensor * ggml_mean(
461
- struct ggml_context * ctx,
462
- struct ggml_tensor * a);
463
-
464
- // if a is the same shape as b, and a is not parameter, return a
465
- // otherwise, return a new tensor: repeat(a) to fit in b
466
- struct ggml_tensor * ggml_repeat(
467
- struct ggml_context * ctx,
468
- struct ggml_tensor * a,
469
- struct ggml_tensor * b);
470
-
471
- struct ggml_tensor * ggml_abs(
472
- struct ggml_context * ctx,
473
- struct ggml_tensor * a);
474
-
475
- struct ggml_tensor * ggml_sgn(
476
- struct ggml_context * ctx,
477
- struct ggml_tensor * a);
478
-
479
- struct ggml_tensor * ggml_neg(
480
- struct ggml_context * ctx,
481
- struct ggml_tensor * a);
482
-
483
- struct ggml_tensor * ggml_step(
484
- struct ggml_context * ctx,
485
- struct ggml_tensor * a);
486
-
487
- struct ggml_tensor * ggml_relu(
488
- struct ggml_context * ctx,
489
- struct ggml_tensor * a);
490
-
491
- // TODO: double-check this computation is correct
492
- struct ggml_tensor * ggml_gelu(
493
- struct ggml_context * ctx,
494
- struct ggml_tensor * a);
495
-
496
- struct ggml_tensor * ggml_silu(
497
- struct ggml_context * ctx,
498
- struct ggml_tensor * a);
499
-
500
- // normalize along rows
501
- // TODO: eps is hardcoded to 1e-5 for now
502
- struct ggml_tensor * ggml_norm(
503
- struct ggml_context * ctx,
504
- struct ggml_tensor * a);
505
-
506
- struct ggml_tensor * ggml_rms_norm(
507
- struct ggml_context * ctx,
508
- struct ggml_tensor * a);
509
-
510
- // A: m rows, n columns
511
- // B: p rows, n columns (i.e. we transpose it internally)
512
- // result is m columns, p rows
513
- struct ggml_tensor * ggml_mul_mat(
514
- struct ggml_context * ctx,
515
- struct ggml_tensor * a,
516
- struct ggml_tensor * b);
517
-
518
- //
519
- // operations on tensors without backpropagation
520
- //
521
-
522
- // in-place, returns view(a)
523
- struct ggml_tensor * ggml_scale(
524
- struct ggml_context * ctx,
525
- struct ggml_tensor * a,
526
- struct ggml_tensor * b);
527
-
528
- // a -> b, return view(b)
529
- struct ggml_tensor * ggml_cpy(
530
- struct ggml_context * ctx,
531
- struct ggml_tensor * a,
532
- struct ggml_tensor * b);
533
-
534
- // make contiguous
535
- struct ggml_tensor * ggml_cont(
536
- struct ggml_context * ctx,
537
- struct ggml_tensor * a);
538
-
539
- // return view(a), b specifies the new shape
540
- // TODO: when we start computing gradient, make a copy instead of view
541
- struct ggml_tensor * ggml_reshape(
542
- struct ggml_context * ctx,
543
- struct ggml_tensor * a,
544
- struct ggml_tensor * b);
545
-
546
- // return view(a)
547
- // TODO: when we start computing gradient, make a copy instead of view
548
- struct ggml_tensor * ggml_reshape_2d(
549
- struct ggml_context * ctx,
550
- struct ggml_tensor * a,
551
- int64_t ne0,
552
- int64_t ne1);
553
-
554
- // return view(a)
555
- // TODO: when we start computing gradient, make a copy instead of view
556
- struct ggml_tensor * ggml_reshape_3d(
557
- struct ggml_context * ctx,
558
- struct ggml_tensor * a,
559
- int64_t ne0,
560
- int64_t ne1,
561
- int64_t ne2);
562
-
563
- // offset in bytes
564
- struct ggml_tensor * ggml_view_1d(
565
- struct ggml_context * ctx,
566
- struct ggml_tensor * a,
567
- int64_t ne0,
568
- size_t offset);
569
-
570
- struct ggml_tensor * ggml_view_2d(
571
- struct ggml_context * ctx,
572
- struct ggml_tensor * a,
573
- int64_t ne0,
574
- int64_t ne1,
575
- size_t nb1, // row stride in bytes
576
- size_t offset);
577
-
578
- struct ggml_tensor * ggml_view_3d(
579
- struct ggml_context * ctx,
580
- struct ggml_tensor * a,
581
- int64_t ne0,
582
- int64_t ne1,
583
- int64_t ne2,
584
- size_t nb1, // row stride in bytes
585
- size_t nb2, // slice stride in bytes
586
- size_t offset);
587
-
588
- struct ggml_tensor * ggml_permute(
589
- struct ggml_context * ctx,
590
- struct ggml_tensor * a,
591
- int axis0,
592
- int axis1,
593
- int axis2,
594
- int axis3);
595
-
596
- // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
597
- struct ggml_tensor * ggml_transpose(
598
- struct ggml_context * ctx,
599
- struct ggml_tensor * a);
600
-
601
- struct ggml_tensor * ggml_get_rows(
602
- struct ggml_context * ctx,
603
- struct ggml_tensor * a,
604
- struct ggml_tensor * b);
605
-
606
- // set elements above the diagonal to -INF
607
- // in-place, returns view(a)
608
- struct ggml_tensor * ggml_diag_mask_inf(
609
- struct ggml_context * ctx,
610
- struct ggml_tensor * a,
611
- int n_past);
612
-
613
- // in-place, returns view(a)
614
- struct ggml_tensor * ggml_soft_max(
615
- struct ggml_context * ctx,
616
- struct ggml_tensor * a);
617
-
618
- // rotary position embedding
619
- // in-place, returns view(a)
620
- // if mode == 1, skip n_past elements
621
- // TODO: avoid creating a new tensor every time
622
- struct ggml_tensor * ggml_rope(
623
- struct ggml_context * ctx,
624
- struct ggml_tensor * a,
625
- int n_past,
626
- int n_dims,
627
- int mode);
628
-
629
- // padding = 1
630
- // TODO: we don't support extra parameters for now
631
- // that's why we are hard-coding the stride, padding, and dilation
632
- // not great ..
633
- struct ggml_tensor * ggml_conv_1d_1s(
634
- struct ggml_context * ctx,
635
- struct ggml_tensor * a,
636
- struct ggml_tensor * b);
637
-
638
- struct ggml_tensor * ggml_conv_1d_2s(
639
- struct ggml_context * ctx,
640
- struct ggml_tensor * a,
641
- struct ggml_tensor * b);
642
-
643
- struct ggml_tensor * ggml_flash_attn(
644
- struct ggml_context * ctx,
645
- struct ggml_tensor * q,
646
- struct ggml_tensor * k,
647
- struct ggml_tensor * v,
648
- bool masked);
649
-
650
- struct ggml_tensor * ggml_flash_ff(
651
- struct ggml_context * ctx,
652
- struct ggml_tensor * a,
653
- struct ggml_tensor * b0,
654
- struct ggml_tensor * b1,
655
- struct ggml_tensor * c0,
656
- struct ggml_tensor * c1);
657
-
658
- // Mapping operations
659
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
660
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
661
-
662
- struct ggml_tensor * ggml_map_unary_f32(
663
- struct ggml_context * ctx,
664
- struct ggml_tensor * a,
665
- const ggml_unary_op_f32_t fun);
666
-
667
- struct ggml_tensor * ggml_map_binary_f32(
668
- struct ggml_context * ctx,
669
- struct ggml_tensor * a,
670
- struct ggml_tensor * b,
671
- const ggml_binary_op_f32_t fun);
672
-
673
- //
674
- // automatic differentiation
675
- //
676
-
677
- void ggml_set_param(
678
- struct ggml_context * ctx,
679
- struct ggml_tensor * tensor);
680
-
681
- void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
682
-
683
- struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
684
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
685
-
686
- void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
687
- void ggml_graph_reset (struct ggml_cgraph * cgraph);
688
-
689
- // print info and performance information for the graph
690
- void ggml_graph_print(const struct ggml_cgraph * cgraph);
691
-
692
- // dump the graph into a file using the dot format
693
- void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
694
-
695
- //
696
- // optimization
697
- //
698
-
699
- // optimization methods
700
- enum ggml_opt_type {
701
- GGML_OPT_ADAM,
702
- GGML_OPT_LBFGS,
703
- };
704
-
705
- // linesearch methods
706
- enum ggml_linesearch {
707
- GGML_LINESEARCH_DEFAULT = 1,
708
-
709
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
710
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
711
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
712
- };
713
-
714
- // optimization return values
715
- enum ggml_opt_result {
716
- GGML_OPT_OK = 0,
717
- GGML_OPT_DID_NOT_CONVERGE,
718
- GGML_OPT_NO_CONTEXT,
719
- GGML_OPT_INVALID_WOLFE,
720
- GGML_OPT_FAIL,
721
 
722
- GGML_LINESEARCH_FAIL = -128,
723
- GGML_LINESEARCH_MINIMUM_STEP,
724
- GGML_LINESEARCH_MAXIMUM_STEP,
725
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
726
- GGML_LINESEARCH_INVALID_PARAMETERS,
727
- };
728
 
729
- // optimization parameters
730
- //
731
- // see ggml.c (ggml_opt_default_params) for default values
732
- //
733
- struct ggml_opt_params {
734
- enum ggml_opt_type type;
735
 
736
- int n_threads;
737
 
738
- // delta-based convergence test
739
  //
740
- // if past == 0 - disabled
741
- // if past > 0:
742
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
743
  //
744
- int past;
745
- float delta;
746
 
747
- // maximum number of iterations without improvement
748
  //
749
- // if 0 - disabled
750
- // if > 0:
751
- // assume convergence if no cost improvement in this number of iterations
752
  //
753
- int max_no_improvement;
754
 
755
- bool print_forward_graph;
756
- bool print_backward_graph;
757
 
758
- // ADAM parameters
759
- struct {
760
- int n_iter;
761
 
762
- float alpha; // learning rate
763
- float beta1;
764
- float beta2;
765
- float eps; // epsilon for numerical stability
766
- float eps_f; // epsilon for convergence test
767
- float eps_g; // epsilon for convergence test
768
- } adam;
769
 
770
- // LBFGS parameters
771
- struct {
772
- int m; // number of corrections to approximate the inv. Hessian
773
- int n_iter;
774
- int max_linesearch;
775
 
776
- float eps; // convergence tolerance
777
- float ftol; // line search tolerance
778
- float wolfe;
779
- float min_step;
780
- float max_step;
781
 
782
- enum ggml_linesearch linesearch;
783
- } lbfgs;
784
- };
785
 
786
- struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
 
787
 
788
- // optimize the function defined by the tensor f
789
- enum ggml_opt_result ggml_opt(
790
- struct ggml_context * ctx,
791
- struct ggml_opt_params params,
792
- struct ggml_tensor * f);
793
 
794
- //
795
- // quantization
796
- //
797
 
798
- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
799
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
800
 
801
- //
802
- // system info
803
- //
804
 
805
- int ggml_cpu_has_avx(void);
806
- int ggml_cpu_has_avx2(void);
807
- int ggml_cpu_has_avx512(void);
808
- int ggml_cpu_has_fma(void);
809
- int ggml_cpu_has_neon(void);
810
- int ggml_cpu_has_arm_fma(void);
811
- int ggml_cpu_has_f16c(void);
812
- int ggml_cpu_has_fp16_va(void);
813
- int ggml_cpu_has_wasm_simd(void);
814
- int ggml_cpu_has_blas(void);
815
- int ggml_cpu_has_sse3(void);
816
- int ggml_cpu_has_vsx(void);
817
 
 
818
 
819
- //
820
- // Internal types and functions exposed for tests and benchmarks
821
- //
822
 
823
  #ifdef __cplusplus
824
- // restrict not standard in C++
825
  #define GGML_RESTRICT
826
  #else
827
  #define GGML_RESTRICT restrict
828
  #endif
829
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
830
- typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
831
- typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
832
-
833
- typedef struct {
834
- dequantize_row_q_t dequantize_row_q;
835
- quantize_row_q_t quantize_row_q;
836
- quantize_row_q_t quantize_row_q_reference;
837
- vec_dot_q_t vec_dot_q;
838
- } quantize_fns_t;
839
-
840
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
 
 
841
 
842
  #ifdef __cplusplus
843
  }
 
169
  //
170
  //
171
 
172
+ #ifdef GGML_SHARED
173
+ # if defined(_WIN32) && !defined(__MINGW32__)
174
+ # ifdef GGML_BUILD
175
+ # define GGML_API __declspec(dllexport)
176
+ # else
177
+ # define GGML_API __declspec(dllimport)
178
+ # endif
179
+ # else
180
+ # define GGML_API __attribute__ ((visibility ("default")))
181
+ # endif
182
+ #else
183
+ # define GGML_API
184
  #endif
185
 
186
  #include <stdint.h>
187
  #include <stddef.h>
188
  #include <stdbool.h>
189
 
190
+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
+ #define GGML_FILE_VERSION 1
192
+
193
  #define GGML_MAX_DIMS 4
194
  #define GGML_MAX_NODES 4096
195
  #define GGML_MAX_PARAMS 16
 
197
  #define GGML_MAX_OPT 4
198
  #define GGML_DEFAULT_N_THREADS 4
199
 
200
+ #ifdef __cplusplus
201
+ extern "C" {
202
+ #endif
203
+
204
  #ifdef __ARM_NEON
205
+ // we use the built-in 16-bit float type
206
+ typedef __fp16 ggml_fp16_t;
207
  #else
208
+ typedef uint16_t ggml_fp16_t;
209
  #endif
210
 
211
+ // convert FP16 <-> FP32
212
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
213
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
214
+
215
+ struct ggml_object;
216
+ struct ggml_context;
217
+
218
+ enum ggml_type {
219
+ GGML_TYPE_F32 = 0,
220
+ GGML_TYPE_F16 = 1,
221
+ GGML_TYPE_Q4_0 = 2,
222
+ GGML_TYPE_Q4_1 = 3,
223
+ GGML_TYPE_Q4_2 = 4,
224
+ // GGML_TYPE_Q4_3 (5) support has been removed
225
+ GGML_TYPE_Q5_0 = 6,
226
+ GGML_TYPE_Q5_1 = 7,
227
+ GGML_TYPE_Q8_0 = 8,
228
+ GGML_TYPE_Q8_1 = 9,
229
+ GGML_TYPE_I8,
230
+ GGML_TYPE_I16,
231
+ GGML_TYPE_I32,
232
+ GGML_TYPE_COUNT,
233
+ };
234
+
235
+ // available tensor operations:
236
+ enum ggml_op {
237
+ GGML_OP_NONE = 0,
238
+
239
+ GGML_OP_DUP,
240
+ GGML_OP_ADD,
241
+ GGML_OP_SUB,
242
+ GGML_OP_MUL,
243
+ GGML_OP_DIV,
244
+ GGML_OP_SQR,
245
+ GGML_OP_SQRT,
246
+ GGML_OP_SUM,
247
+ GGML_OP_MEAN,
248
+ GGML_OP_REPEAT,
249
+ GGML_OP_ABS,
250
+ GGML_OP_SGN,
251
+ GGML_OP_NEG,
252
+ GGML_OP_STEP,
253
+ GGML_OP_RELU,
254
+ GGML_OP_GELU,
255
+ GGML_OP_SILU,
256
+ GGML_OP_NORM, // normalize
257
+ GGML_OP_RMS_NORM,
258
+
259
+ GGML_OP_MUL_MAT,
260
+
261
+ GGML_OP_SCALE,
262
+ GGML_OP_CPY,
263
+ GGML_OP_CONT,
264
+ GGML_OP_RESHAPE,
265
+ GGML_OP_VIEW,
266
+ GGML_OP_PERMUTE,
267
+ GGML_OP_TRANSPOSE,
268
+ GGML_OP_GET_ROWS,
269
+ GGML_OP_DIAG_MASK_INF,
270
+ GGML_OP_SOFT_MAX,
271
+ GGML_OP_ROPE,
272
+ GGML_OP_ALIBI,
273
+ GGML_OP_CONV_1D_1S,
274
+ GGML_OP_CONV_1D_2S,
275
+
276
+ GGML_OP_FLASH_ATTN,
277
+ GGML_OP_FLASH_FF,
278
+
279
+ GGML_OP_MAP_UNARY,
280
+ GGML_OP_MAP_BINARY,
281
+
282
+ GGML_OP_COUNT,
283
+ };
284
+
285
+
286
+ // ggml object
287
+ struct ggml_object {
288
+ size_t offs;
289
+ size_t size;
290
+
291
+ struct ggml_object * next;
292
+
293
+ char padding[8];
294
+ };
295
+
296
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
297
+
298
+ // n-dimensional tensor
299
+ struct ggml_tensor {
300
+ enum ggml_type type;
301
+
302
+ int n_dims;
303
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
304
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
305
+ // nb[0] = sizeof(type)
306
+ // nb[1] = nb[0] * ne[0] + padding
307
+ // nb[i] = nb[i-1] * ne[i-1]
308
+
309
+ // compute data
310
+ enum ggml_op op;
311
+
312
+ bool is_param;
313
+
314
+ struct ggml_tensor * grad;
315
+ struct ggml_tensor * src0;
316
+ struct ggml_tensor * src1;
317
+ struct ggml_tensor * opt[GGML_MAX_OPT];
318
+
319
+ // thread scheduling
320
+ int n_tasks;
321
+
322
+ // performance
323
+ int perf_runs;
324
+ int64_t perf_cycles;
325
+ int64_t perf_time_us;
326
+
327
+ void * data;
328
+ char padding[8];
329
+ };
330
+
331
+ // computation graph
332
+ struct ggml_cgraph {
333
+ int n_nodes;
334
+ int n_leafs;
335
+ int n_threads;
336
+
337
+ size_t work_size;
338
+ struct ggml_tensor * work;
339
+
340
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
341
+ struct ggml_tensor * grads[GGML_MAX_NODES];
342
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
343
+
344
+ // performance
345
+ int perf_runs;
346
+ int64_t perf_cycles;
347
+ int64_t perf_time_us;
348
+ };
349
+
350
+ // scratch buffer
351
+ struct ggml_scratch {
352
+ size_t offs;
353
+ size_t size;
354
+ void * data;
355
+ };
356
 
357
+ struct ggml_init_params {
358
+ // memory pool
359
+ size_t mem_size; // bytes
360
+ void * mem_buffer; // if NULL, memory will be allocated internally
361
+ bool no_alloc; // don't allocate memory for the tensor data
362
+ };
363
 
364
+ // misc
 
365
 
366
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
367
+ GGML_API int64_t ggml_time_ms(void);
368
+ GGML_API int64_t ggml_time_us(void);
369
+ GGML_API int64_t ggml_cycles(void);
370
+ GGML_API int64_t ggml_cycles_per_ms(void);
371
 
372
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
373
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
374
 
375
+ GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
376
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
377
 
378
+ GGML_API int ggml_blck_size (enum ggml_type type);
379
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
380
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
381
 
382
+ GGML_API const char * ggml_type_name(enum ggml_type type);
383
 
384
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
385
+
386
+ GGML_API bool ggml_is_quantized(enum ggml_type type);
387
+
388
+ // main
389
+
390
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
391
+ GGML_API void ggml_free(struct ggml_context * ctx);
392
+
393
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
394
+
395
+ GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
396
+
397
+ GGML_API struct ggml_tensor * ggml_new_tensor(
398
+ struct ggml_context * ctx,
399
+ enum ggml_type type,
400
+ int n_dims,
401
+ const int64_t *ne);
402
+
403
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
404
+ struct ggml_context * ctx,
405
+ enum ggml_type type,
406
+ int64_t ne0);
407
+
408
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
409
+ struct ggml_context * ctx,
410
+ enum ggml_type type,
411
+ int64_t ne0,
412
+ int64_t ne1);
413
+
414
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
415
+ struct ggml_context * ctx,
416
+ enum ggml_type type,
417
+ int64_t ne0,
418
+ int64_t ne1,
419
+ int64_t ne2);
420
+
421
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
422
+ struct ggml_context * ctx,
423
+ enum ggml_type type,
424
+ int64_t ne0,
425
+ int64_t ne1,
426
+ int64_t ne2,
427
+ int64_t ne3);
428
+
429
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
430
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
431
+
432
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
433
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
434
+
435
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
436
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
437
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
438
 
439
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
440
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
441
+
442
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
443
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
444
+
445
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
446
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
447
 
 
448
  //
449
+ // operations on tensors with backpropagation
 
 
450
  //
 
 
451
 
452
+ GGML_API struct ggml_tensor * ggml_dup(
453
+ struct ggml_context * ctx,
454
+ struct ggml_tensor * a);
455
+
456
+ GGML_API struct ggml_tensor * ggml_add(
457
+ struct ggml_context * ctx,
458
+ struct ggml_tensor * a,
459
+ struct ggml_tensor * b);
460
+
461
+ GGML_API struct ggml_tensor * ggml_add_inplace(
462
+ struct ggml_context * ctx,
463
+ struct ggml_tensor * a,
464
+ struct ggml_tensor * b);
465
+
466
+ GGML_API struct ggml_tensor * ggml_sub(
467
+ struct ggml_context * ctx,
468
+ struct ggml_tensor * a,
469
+ struct ggml_tensor * b);
470
+
471
+ GGML_API struct ggml_tensor * ggml_mul(
472
+ struct ggml_context * ctx,
473
+ struct ggml_tensor * a,
474
+ struct ggml_tensor * b);
475
+
476
+ GGML_API struct ggml_tensor * ggml_div(
477
+ struct ggml_context * ctx,
478
+ struct ggml_tensor * a,
479
+ struct ggml_tensor * b);
480
+
481
+ GGML_API struct ggml_tensor * ggml_sqr(
482
+ struct ggml_context * ctx,
483
+ struct ggml_tensor * a);
484
+
485
+ GGML_API struct ggml_tensor * ggml_sqrt(
486
+ struct ggml_context * ctx,
487
+ struct ggml_tensor * a);
488
+
489
+ // return scalar
490
+ // TODO: compute sum along rows
491
+ GGML_API struct ggml_tensor * ggml_sum(
492
+ struct ggml_context * ctx,
493
+ struct ggml_tensor * a);
494
+
495
+ // mean along rows
496
+ GGML_API struct ggml_tensor * ggml_mean(
497
+ struct ggml_context * ctx,
498
+ struct ggml_tensor * a);
499
+
500
+ // if a is the same shape as b, and a is not parameter, return a
501
+ // otherwise, return a new tensor: repeat(a) to fit in b
502
+ GGML_API struct ggml_tensor * ggml_repeat(
503
+ struct ggml_context * ctx,
504
+ struct ggml_tensor * a,
505
+ struct ggml_tensor * b);
506
+
507
+ GGML_API struct ggml_tensor * ggml_abs(
508
+ struct ggml_context * ctx,
509
+ struct ggml_tensor * a);
510
+
511
+ GGML_API struct ggml_tensor * ggml_sgn(
512
+ struct ggml_context * ctx,
513
+ struct ggml_tensor * a);
514
+
515
+ GGML_API struct ggml_tensor * ggml_neg(
516
+ struct ggml_context * ctx,
517
+ struct ggml_tensor * a);
518
+
519
+ GGML_API struct ggml_tensor * ggml_step(
520
+ struct ggml_context * ctx,
521
+ struct ggml_tensor * a);
522
+
523
+ GGML_API struct ggml_tensor * ggml_relu(
524
+ struct ggml_context * ctx,
525
+ struct ggml_tensor * a);
526
+
527
+ // TODO: double-check this computation is correct
528
+ GGML_API struct ggml_tensor * ggml_gelu(
529
+ struct ggml_context * ctx,
530
+ struct ggml_tensor * a);
531
+
532
+ GGML_API struct ggml_tensor * ggml_silu(
533
+ struct ggml_context * ctx,
534
+ struct ggml_tensor * a);
535
+
536
+ // normalize along rows
537
+ // TODO: eps is hardcoded to 1e-5 for now
538
+ GGML_API struct ggml_tensor * ggml_norm(
539
+ struct ggml_context * ctx,
540
+ struct ggml_tensor * a);
541
+
542
+ GGML_API struct ggml_tensor * ggml_rms_norm(
543
+ struct ggml_context * ctx,
544
+ struct ggml_tensor * a);
545
+
546
+ // A: m rows, n columns
547
+ // B: p rows, n columns (i.e. we transpose it internally)
548
+ // result is m columns, p rows
549
+ GGML_API struct ggml_tensor * ggml_mul_mat(
550
+ struct ggml_context * ctx,
551
+ struct ggml_tensor * a,
552
+ struct ggml_tensor * b);
553
+
554
  //
555
+ // operations on tensors without backpropagation
 
 
556
  //
 
557
 
558
+ // in-place, returns view(a)
559
+ GGML_API struct ggml_tensor * ggml_scale(
560
+ struct ggml_context * ctx,
561
+ struct ggml_tensor * a,
562
+ struct ggml_tensor * b);
563
+
564
+ // a -> b, return view(b)
565
+ GGML_API struct ggml_tensor * ggml_cpy(
566
+ struct ggml_context * ctx,
567
+ struct ggml_tensor * a,
568
+ struct ggml_tensor * b);
569
+
570
+ // make contiguous
571
+ GGML_API struct ggml_tensor * ggml_cont(
572
+ struct ggml_context * ctx,
573
+ struct ggml_tensor * a);
574
+
575
+ // return view(a), b specifies the new shape
576
+ // TODO: when we start computing gradient, make a copy instead of view
577
+ GGML_API struct ggml_tensor * ggml_reshape(
578
+ struct ggml_context * ctx,
579
+ struct ggml_tensor * a,
580
+ struct ggml_tensor * b);
581
+
582
+ // return view(a)
583
+ // TODO: when we start computing gradient, make a copy instead of view
584
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
585
+ struct ggml_context * ctx,
586
+ struct ggml_tensor * a,
587
+ int64_t ne0,
588
+ int64_t ne1);
589
+
590
+ // return view(a)
591
+ // TODO: when we start computing gradient, make a copy instead of view
592
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
593
+ struct ggml_context * ctx,
594
+ struct ggml_tensor * a,
595
+ int64_t ne0,
596
+ int64_t ne1,
597
+ int64_t ne2);
598
+
599
+ // offset in bytes
600
+ GGML_API struct ggml_tensor * ggml_view_1d(
601
+ struct ggml_context * ctx,
602
+ struct ggml_tensor * a,
603
+ int64_t ne0,
604
+ size_t offset);
605
+
606
+ GGML_API struct ggml_tensor * ggml_view_2d(
607
+ struct ggml_context * ctx,
608
+ struct ggml_tensor * a,
609
+ int64_t ne0,
610
+ int64_t ne1,
611
+ size_t nb1, // row stride in bytes
612
+ size_t offset);
613
+
614
+ GGML_API struct ggml_tensor * ggml_view_3d(
615
+ struct ggml_context * ctx,
616
+ struct ggml_tensor * a,
617
+ int64_t ne0,
618
+ int64_t ne1,
619
+ int64_t ne2,
620
+ size_t nb1, // row stride in bytes
621
+ size_t nb2, // slice stride in bytes
622
+ size_t offset);
623
+
624
+ GGML_API struct ggml_tensor * ggml_permute(
625
+ struct ggml_context * ctx,
626
+ struct ggml_tensor * a,
627
+ int axis0,
628
+ int axis1,
629
+ int axis2,
630
+ int axis3);
631
+
632
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
633
+ GGML_API struct ggml_tensor * ggml_transpose(
634
+ struct ggml_context * ctx,
635
+ struct ggml_tensor * a);
636
+
637
+ GGML_API struct ggml_tensor * ggml_get_rows(
638
+ struct ggml_context * ctx,
639
+ struct ggml_tensor * a,
640
+ struct ggml_tensor * b);
641
+
642
+ // set elements above the diagonal to -INF
643
+ // in-place, returns view(a)
644
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
645
+ struct ggml_context * ctx,
646
+ struct ggml_tensor * a,
647
+ int n_past);
648
+
649
+ // in-place, returns view(a)
650
+ GGML_API struct ggml_tensor * ggml_soft_max(
651
+ struct ggml_context * ctx,
652
+ struct ggml_tensor * a);
653
+
654
+ // rotary position embedding
655
+ // in-place, returns view(a)
656
+ // if mode & 1 == 1, skip n_past elements
657
+ // if mode & 2 == 1, GPT-NeoX style
658
+ // TODO: avoid creating a new tensor every time
659
+ GGML_API struct ggml_tensor * ggml_rope(
660
+ struct ggml_context * ctx,
661
+ struct ggml_tensor * a,
662
+ int n_past,
663
+ int n_dims,
664
+ int mode);
665
+
666
+ // alibi position embedding
667
+ // in-place, returns view(a)
668
+ struct ggml_tensor * ggml_alibi(
669
+ struct ggml_context * ctx,
670
+ struct ggml_tensor * a,
671
+ int n_past,
672
+ int n_head);
673
+
674
+ // padding = 1
675
+ // TODO: we don't support extra parameters for now
676
+ // that's why we are hard-coding the stride, padding, and dilation
677
+ // not great ..
678
+ GGML_API struct ggml_tensor * ggml_conv_1d_1s(
679
+ struct ggml_context * ctx,
680
+ struct ggml_tensor * a,
681
+ struct ggml_tensor * b);
682
+
683
+ GGML_API struct ggml_tensor * ggml_conv_1d_2s(
684
+ struct ggml_context * ctx,
685
+ struct ggml_tensor * a,
686
+ struct ggml_tensor * b);
687
+
688
+ GGML_API struct ggml_tensor * ggml_flash_attn(
689
+ struct ggml_context * ctx,
690
+ struct ggml_tensor * q,
691
+ struct ggml_tensor * k,
692
+ struct ggml_tensor * v,
693
+ bool masked);
694
+
695
+ GGML_API struct ggml_tensor * ggml_flash_ff(
696
+ struct ggml_context * ctx,
697
+ struct ggml_tensor * a,
698
+ struct ggml_tensor * b0,
699
+ struct ggml_tensor * b1,
700
+ struct ggml_tensor * c0,
701
+ struct ggml_tensor * c1);
702
+
703
+ // Mapping operations
704
+ GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
705
+ GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
706
+
707
+ GGML_API struct ggml_tensor * ggml_map_unary_f32(
708
+ struct ggml_context * ctx,
709
+ struct ggml_tensor * a,
710
+ const ggml_unary_op_f32_t fun);
711
+
712
+ GGML_API struct ggml_tensor * ggml_map_binary_f32(
713
+ struct ggml_context * ctx,
714
+ struct ggml_tensor * a,
715
+ struct ggml_tensor * b,
716
+ const ggml_binary_op_f32_t fun);
717
 
718
+ //
719
+ // automatic differentiation
720
+ //
721
 
722
+ GGML_API void ggml_set_param(
723
+ struct ggml_context * ctx,
724
+ struct ggml_tensor * tensor);
725
 
726
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
727
 
728
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
729
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
730
 
731
+ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
732
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
 
733
 
734
+ // print info and performance information for the graph
735
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
736
 
737
+ // dump the graph into a file using the dot format
738
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
739
 
740
+ //
741
+ // optimization
742
+ //
743
 
744
+ // optimization methods
745
+ enum ggml_opt_type {
746
+ GGML_OPT_ADAM,
747
+ GGML_OPT_LBFGS,
748
+ };
749
+
750
+ // linesearch methods
751
+ enum ggml_linesearch {
752
+ GGML_LINESEARCH_DEFAULT = 1,
753
+
754
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
755
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
756
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
757
+ };
758
+
759
+ // optimization return values
760
+ enum ggml_opt_result {
761
+ GGML_OPT_OK = 0,
762
+ GGML_OPT_DID_NOT_CONVERGE,
763
+ GGML_OPT_NO_CONTEXT,
764
+ GGML_OPT_INVALID_WOLFE,
765
+ GGML_OPT_FAIL,
766
+
767
+ GGML_LINESEARCH_FAIL = -128,
768
+ GGML_LINESEARCH_MINIMUM_STEP,
769
+ GGML_LINESEARCH_MAXIMUM_STEP,
770
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
771
+ GGML_LINESEARCH_INVALID_PARAMETERS,
772
+ };
773
+
774
+ // optimization parameters
775
+ //
776
+ // see ggml.c (ggml_opt_default_params) for default values
777
+ //
778
+ struct ggml_opt_params {
779
+ enum ggml_opt_type type;
780
+
781
+ int n_threads;
782
+
783
+ // delta-based convergence test
784
+ //
785
+ // if past == 0 - disabled
786
+ // if past > 0:
787
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
788
+ //
789
+ int past;
790
+ float delta;
791
+
792
+ // maximum number of iterations without improvement
793
+ //
794
+ // if 0 - disabled
795
+ // if > 0:
796
+ // assume convergence if no cost improvement in this number of iterations
797
+ //
798
+ int max_no_improvement;
799
+
800
+ bool print_forward_graph;
801
+ bool print_backward_graph;
802
+
803
+ // ADAM parameters
804
+ struct {
805
+ int n_iter;
806
+
807
+ float alpha; // learning rate
808
+ float beta1;
809
+ float beta2;
810
+ float eps; // epsilon for numerical stability
811
+ float eps_f; // epsilon for convergence test
812
+ float eps_g; // epsilon for convergence test
813
+ } adam;
814
+
815
+ // LBFGS parameters
816
+ struct {
817
+ int m; // number of corrections to approximate the inv. Hessian
818
+ int n_iter;
819
+ int max_linesearch;
820
+
821
+ float eps; // convergence tolerance
822
+ float ftol; // line search tolerance
823
+ float wolfe;
824
+ float min_step;
825
+ float max_step;
826
+
827
+ enum ggml_linesearch linesearch;
828
+ } lbfgs;
829
+ };
830
+
831
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
832
+
833
+ // optimize the function defined by the tensor f
834
+ GGML_API enum ggml_opt_result ggml_opt(
835
+ struct ggml_context * ctx,
836
+ struct ggml_opt_params params,
837
+ struct ggml_tensor * f);
838
 
839
+ //
840
+ // quantization
841
+ //
842
 
843
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
844
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
845
+ GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
846
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
847
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
848
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
849
 
850
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
851
 
852
+ //
853
+ // system info
854
+ //
855
+
856
+ GGML_API int ggml_cpu_has_avx (void);
857
+ GGML_API int ggml_cpu_has_avx2 (void);
858
+ GGML_API int ggml_cpu_has_avx512 (void);
859
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
860
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
861
+ GGML_API int ggml_cpu_has_fma (void);
862
+ GGML_API int ggml_cpu_has_neon (void);
863
+ GGML_API int ggml_cpu_has_arm_fma (void);
864
+ GGML_API int ggml_cpu_has_f16c (void);
865
+ GGML_API int ggml_cpu_has_fp16_va (void);
866
+ GGML_API int ggml_cpu_has_wasm_simd (void);
867
+ GGML_API int ggml_cpu_has_blas (void);
868
+ GGML_API int ggml_cpu_has_cublas (void);
869
+ GGML_API int ggml_cpu_has_clblast (void);
870
+ GGML_API int ggml_cpu_has_gpublas (void);
871
+ GGML_API int ggml_cpu_has_sse3 (void);
872
+ GGML_API int ggml_cpu_has_vsx (void);
873
+
874
+ //
875
+ // Internal types and functions exposed for tests and benchmarks
876
+ //
877
 
878
  #ifdef __cplusplus
879
+ // restrict not standard in C++
880
  #define GGML_RESTRICT
881
  #else
882
  #define GGML_RESTRICT restrict
883
  #endif
884
+ typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
885
+ typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
886
+ typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
887
+
888
+ typedef struct {
889
+ dequantize_row_q_t dequantize_row_q;
890
+ quantize_row_q_t quantize_row_q;
891
+ quantize_row_q_t quantize_row_q_reference;
892
+ quantize_row_q_t quantize_row_q_dot;
893
+ vec_dot_q_t vec_dot_q;
894
+ enum ggml_type vec_dot_type;
895
+ } quantize_fns_t;
896
+
897
+ quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
898
 
899
  #ifdef __cplusplus
900
  }
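
To close the loop on the header changes, a hedged sketch of the new quantization entry point ggml_quantize_chunk() declared above: as I read the declaration, it quantizes n values of src into dst (start is the element offset of the chunk within the full tensor) and returns the number of bytes written. The wrapper below is hypothetical; the 16-entry histogram size is an assumption on my part (it matches the 4-bit nibble range the existing q4 quantizers count), and n is assumed to be a multiple of the type's block size with dst sized accordingly.

#include <stdint.h>
#include "ggml.h"

// hypothetical usage example: quantize one row of n floats to Q5_0
static size_t example_quantize_row_q5_0(const float * src, void * dst, int n) {
    int64_t hist[16] = {0};  // filled with per-value counts by the quantizer
    return ggml_quantize_chunk(GGML_TYPE_Q5_0, src, dst, 0, n, hist);
}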