ggerganov committed
Commit bc567d3 · unverified · 1 Parent(s): d75e158

ggml : always define ggml_fp16_t as uint16_t (llama/5666)

* ggml : always define ggml_fp16_t as uint16_t

ggml-ci

* ggml : cont

ggml-ci

* ggml : cont

* ggml : cont

ggml-ci

* ggml : cont

ggml-ci

* cuda : no longer ggml headers last

ggml-ci

* ggml : fix q6_K FP16 -> FP32 conversion

ggml-ci

* ggml : more FP16 -> FP32 conversion fixes

ggml-ci
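
The hazard the conversion fixes address: once ggml_fp16_t is a plain uint16_t on every target, a bare C cast such as `(float)x[i].d` still compiles but converts the bit pattern as an integer instead of decoding the half-precision value. A hypothetical standalone demo (not part of the patch):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        ggml_fp16_t h = ggml_fp32_to_fp16(1.0f); // bits 0x3C00

        printf("%f\n", (float) h);            // 15360.000000 -- wrong
        printf("%f\n", ggml_fp16_to_fp32(h)); // 1.000000     -- correct
        return 0;
    }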

Files changed (5)
  1. ggml-cuda.cu +4 -5
  2. ggml-impl.h +20 -7
  3. ggml-quants.c +15 -15
  4. ggml.c +3 -3
  5. ggml.h +0 -6
ggml-cuda.cu CHANGED
@@ -1,3 +1,7 @@
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -121,11 +125,6 @@
 
 #endif // defined(GGML_USE_HIPBLAS)
 
-// ggml-cuda need half type so keep ggml headers include at last
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 
 #define CC_PASCAL 600
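
Before this change, ggml.h typedef'd ggml_fp16_t to CUDA's `half` when compiled under nvcc on ARM, which forced the ggml headers to come after the CUDA ones. With plain uint16_t storage they can be included first. A minimal sketch of how a hypothetical consumer might pin down the layout assumption (illustrative, not part of the patch):

    #include <stdint.h>
    #include "ggml.h" // no longer needs to follow the CUDA headers

    // device `half` buffers and host ggml_fp16_t buffers alias the same
    // 16-bit storage, so only the size needs to be guaranteed
    _Static_assert(sizeof(ggml_fp16_t) == sizeof(uint16_t),
                   "ggml_fp16_t must stay 16-bit to alias half buffers");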
ggml-impl.h CHANGED
@@ -53,11 +53,23 @@ extern "C" {
53
  //
54
  #include <arm_neon.h>
55
 
56
- #define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
57
- #define GGML_COMPUTE_FP32_TO_FP16(x) (x)
 
 
 
 
 
 
 
 
58
 
59
- #define GGML_FP16_TO_FP32(x) ((float) (x))
60
- #define GGML_FP32_TO_FP16(x) (x)
 
 
 
 
61
 
62
  #else
63
 
@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
214
  // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
215
  // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
216
  // This is also true for POWER9.
217
- #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
218
-
219
  inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
220
  uint16_t s;
221
  memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
223
  }
224
 
225
  #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
226
- #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
227
 
 
 
228
  #endif
229
 
230
  #define GGML_HASHTABLE_FULL ((size_t)-1)
 
53
  //
54
  #include <arm_neon.h>
55
 
56
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
57
+ #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
58
+
59
+ #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
60
+
61
+ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
62
+ __fp16 tmp;
63
+ memcpy(&tmp, &h, sizeof(ggml_fp16_t));
64
+ return (float)tmp;
65
+ }
66
 
67
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
68
+ ggml_fp16_t res;
69
+ __fp16 tmp = f;
70
+ memcpy(&res, &tmp, sizeof(ggml_fp16_t));
71
+ return res;
72
+ }
73
 
74
  #else
75
 
 
226
  // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
227
  // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
228
  // This is also true for POWER9.
229
+ #if !defined(GGML_FP16_TO_FP32)
 
230
  inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
231
  uint16_t s;
232
  memcpy(&s, &f, sizeof(uint16_t));
 
234
  }
235
 
236
  #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
237
+ #endif
238
 
239
+ #if !defined(GGML_FP32_TO_FP16)
240
+ #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
241
  #endif
242
 
243
  #define GGML_HASHTABLE_FULL ((size_t)-1)
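
On NEON the new ggml_compute_fp16_to_fp32 goes through the native __fp16 type; elsewhere the same uint16_t storage is decoded in software and, as the second hunk shows, GGML_FP16_TO_FP32 falls back to a lookup into the 65536-entry ggml_table_f32_f16. A minimal sketch of what a bit-level decoder and table fill can look like (illustrative names, not ggml's exact scalar implementation):

    #include <stdint.h>
    #include <string.h>

    // decode one IEEE-754 binary16 value stored as uint16_t
    // (handles zeros, subnormals, normals, infinities and NaNs)
    static float demo_fp16_to_fp32(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t exp  = (h >> 10) & 0x1F;
        uint32_t mant = h & 0x3FF;
        uint32_t bits;
        if (exp == 0x1F) {
            bits = sign | 0x7F800000 | (mant << 13);          // inf / NaN
        } else if (exp != 0) {
            bits = sign | ((exp + 112) << 23) | (mant << 13); // normal: rebias 15 -> 127
        } else if (mant == 0) {
            bits = sign;                                      // +/- zero
        } else {
            int shift = 0;                                    // subnormal: renormalize
            while ((mant & 0x400) == 0) { mant <<= 1; shift++; }
            bits = sign | ((uint32_t)(113 - shift) << 23) | ((mant & 0x3FF) << 13);
        }
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // fill a lookup table so each conversion is a single load,
    // the way ggml precomputes ggml_table_f32_f16 during init
    static float demo_table_f32_f16[1 << 16];

    static void demo_init_table(void) {
        for (uint32_t i = 0; i < (1u << 16); ++i) {
            demo_table_f32_f16[i] = demo_fp16_to_fp32((uint16_t) i);
        }
    }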
ggml-quants.c CHANGED
@@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
         const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
         q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
         vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
 
@@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         aux16[1] = (a[0] >> 4) & 0x0f0f;
 
         const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
+        sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;
 
-        const float d = y[i].d * (float)x[i].d[0];
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
 
         const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
 
@@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;
 
         const uint8_t * restrict q5 = x[i].qs;
@@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;
 
         const uint8_t * restrict q5 = x[i].qs;
@@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
 
         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
    for (int i = 0; i < nb; ++i) {
 
-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
 
         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     float sumf = 0;
 
     for (int ib = 0; ib < nb; ib += 2) {
-
        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
        q8b.val[0]    = vld1q_s8(y[ib+0].qs);
@@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
 
-        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) + (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
-
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
    }
 
    *s = sumf;
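
Every hunk above is the same fix: each quantized block stores its scale (and sometimes a min) as ggml_fp16_t, which must now be decoded explicitly before the integer dot product is rescaled in fp32. A minimal scalar sketch of that recurring structure, with hypothetical block types rather than the real block_q2_K layout:

    #include <stdint.h>

    typedef uint16_t demo_fp16_t;

    // stand-in for GGML_FP16_TO_FP32; any correct decoder works here
    extern float demo_fp16_to_fp32(demo_fp16_t h);

    typedef struct { demo_fp16_t d; int8_t qs[32]; } demo_block_q;  // fp16 scale
    typedef struct { float       d; int8_t qs[32]; } demo_block_q8; // fp32 scale

    static float demo_vec_dot(int nb, const demo_block_q * x, const demo_block_q8 * y) {
        float sumf = 0.0f;
        for (int i = 0; i < nb; ++i) {
            int32_t isum = 0;
            for (int j = 0; j < 32; ++j) {
                isum += x[i].qs[j] * y[i].qs[j];   // integer dot product
            }
            // decode the fp16 block scale, then rescale in fp32
            sumf += y[i].d * demo_fp16_to_fp32(x[i].d) * (float)isum;
        }
        return sumf;
    }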
ggml.c CHANGED
@@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
323
  // note: do not use these inside ggml.c
324
  // these are meant to be used via the ggml.h API
325
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
326
- return (float) GGML_FP16_TO_FP32(x);
327
  }
328
 
329
  ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
798
  #define GGML_F16x8 float16x8_t
799
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
800
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
801
- #define GGML_F16x8_LOAD vld1q_f16
802
  #define GGML_F16x8_STORE vst1q_f16
803
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
804
  #define GGML_F16x8_ADD vaddq_f16
@@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
841
  #define GGML_F32Cx4 float32x4_t
842
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
843
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
844
- #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x))
845
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
846
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
847
  #define GGML_F32Cx4_ADD vaddq_f32
 
323
  // note: do not use these inside ggml.c
324
  // these are meant to be used via the ggml.h API
325
  float ggml_fp16_to_fp32(ggml_fp16_t x) {
326
+ return GGML_FP16_TO_FP32(x);
327
  }
328
 
329
  ggml_fp16_t ggml_fp32_to_fp16(float x) {
 
798
  #define GGML_F16x8 float16x8_t
799
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
800
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
801
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
802
  #define GGML_F16x8_STORE vst1q_f16
803
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
804
  #define GGML_F16x8_ADD vaddq_f16
 
841
  #define GGML_F32Cx4 float32x4_t
842
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
843
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
844
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
845
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
846
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
847
  #define GGML_F32Cx4_ADD vaddq_f32
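
The two LOAD macros gain a pointer cast because vld1q_f16/vld1_f16 expect a __fp16 (float16_t) pointer, while ggml buffers now carry fp16 data as uint16_t. A hedged AArch64-only sketch of the same reinterpret-load pattern (demo helper, not part of the patch):

    #include <arm_neon.h>
    #include <stdint.h>

    // horizontally sum 8 half-precision values stored as raw uint16_t
    static float demo_sum8_fp16(const uint16_t * src) {
        float16x8_t v  = vld1q_f16((const __fp16 *) src); // reinterpret load
        float32x4_t lo = vcvt_f32_f16(vget_low_f16(v));   // widen lower half
        float32x4_t hi = vcvt_f32_f16(vget_high_f16(v));  // widen upper half
        return vaddvq_f32(vaddq_f32(lo, hi));             // horizontal add
    }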
ggml.h CHANGED
@@ -315,13 +315,7 @@
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
     typedef uint16_t ggml_fp16_t;
-#endif
 
     // convert FP16 <-> FP32
     GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
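
With the typedef fixed to uint16_t on every platform and compiler, the public API above is the one portable way to move between fp32 values and fp16 storage. A hypothetical round-trip check (not part of the patch):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f); // encode to 16 bits
        float       f = ggml_fp16_to_fp32(h);        // decode back
        // prints the raw encoding and ~3.140625 (fp16 precision)
        printf("0x%04x -> %f\n", (unsigned) h, f);
        return 0;
    }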