ggml : always define ggml_fp16_t as uint16_t (llama/5666)
* ggml : always define ggml_fp16_t as uint16_t
ggml-ci
* ggml : cont
ggml-ci
* ggml : cont
* ggml : cont
ggml-ci
* ggml : cont
ggml-ci
* cuda : no longer ggml headers last
ggml-ci
* ggml : fix q6_K FP16 -> FP32 conversion
ggml-ci
* ggml : more FP16 -> FP32 conversion fixes
ggml-ci
- ggml-cuda.cu +4 -5
- ggml-impl.h +20 -7
- ggml-quants.c +15 -15
- ggml.c +3 -3
- ggml.h +0 -6
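Everything below hinges on treating `ggml_fp16_t` as a raw IEEE-754 binary16 bit pattern stored in a `uint16_t`. For reference (this decoder is a sketch, not part of the patch; ggml itself uses a lookup table on x86 and an `__fp16` reinterpret on NEON, as the ggml-impl.h hunks below show), this is what those 16 bits mean, with denormals omitted for brevity:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// minimal FP16 -> FP32 decoder for normal numbers, zero, Inf and NaN;
// denormals are omitted to keep the sketch short
static float fp16_bits_to_fp32(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    uint32_t em   = h & 0x7FFF;                    // exponent + mantissa
    uint32_t bits;
    if (em >= 0x7C00) {        // max exponent: Inf or NaN
        bits = sign | 0x7F800000 | ((uint32_t)(em & 0x03FF) << 13);
    } else if (em == 0) {      // +/- zero
        bits = sign;
    } else {                   // normal number: rebias exponent 15 -> 127
        bits = sign | (((em >> 10) + 112) << 23) | ((uint32_t)(em & 0x03FF) << 13);
    }
    float f;
    memcpy(&f, &bits, sizeof(f)); // bit copy avoids strict-aliasing issues
    return f;
}

int main(void) {
    printf("%g\n", fp16_bits_to_fp32(0x3C00)); // 1
    printf("%g\n", fp16_bits_to_fp32(0xC000)); // -2
    return 0;
}
```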
ggml-cuda.cu
CHANGED
```diff
@@ -1,3 +1,7 @@
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -121,11 +125,6 @@
 
 #endif // defined(GGML_USE_HIPBLAS)
 
-// ggml-cuda need half type so keep ggml headers include at last
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 
 #define CC_PASCAL 600
```
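The deleted comment records the old constraint: under nvcc, ggml.h used to typedef `ggml_fp16_t` to CUDA's `half`, so `cuda_fp16.h` had to be seen before the ggml headers. A compressed sketch of the before/after contract (the "before" block is reconstructed from the ggml.h hunk at the bottom of this page, not patch code):

```c
#include <stdint.h>

/* before: the public fp16 type was chosen at preprocessing time

       #if defined(__ARM_NEON) && defined(__CUDACC__)
       typedef half ggml_fp16_t;      // needs cuda_fp16.h included first
       #elif defined(__ARM_NEON) && !defined(_MSC_VER)
       typedef __fp16 ggml_fp16_t;
       #else
       typedef uint16_t ggml_fp16_t;
       #endif

   after: one definition, no header-order dependency, so the ggml
   includes can move to the top of the file: */
typedef uint16_t ggml_fp16_t;

_Static_assert(sizeof(ggml_fp16_t) == 2, "fp16 payload is always 16 bits");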
ggml-impl.h
CHANGED
```diff
@@ -53,11 +53,23 @@ extern "C" {
 //
 #include <arm_neon.h>
 
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    __fp16 tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+    return (float)tmp;
+}
 
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    ggml_fp16_t res;
+    __fp16 tmp = f;
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+    return res;
+}
 
 #else
 
@@ -214,8 +226,7 @@ extern float ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
+#if !defined(GGML_FP16_TO_FP32)
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     uint16_t s;
     memcpy(&s, &f, sizeof(uint16_t));
@@ -223,8 +234,10 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 }
 
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
 
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
 #define GGML_HASHTABLE_FULL ((size_t)-1)
```
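Two details in the NEON branch are worth calling out: the 16-bit payload moves through `memcpy` rather than a pointer cast, which keeps the type punning well-defined under strict aliasing while still compiling to a plain register move, and `GGML_FP16_TO_FP32` now bypasses the lookup table entirely on this path. A minimal round-trip check under the same assumptions as the diff (an AArch64 compiler with `__fp16`; the helper names are local to this sketch, not ggml's):

```c
#include <stdint.h>
#include <string.h>
#include <assert.h>

typedef uint16_t ggml_fp16_t;

static inline float fp16_to_fp32(ggml_fp16_t h) {
    __fp16 tmp;
    memcpy(&tmp, &h, sizeof(ggml_fp16_t)); // bit copy, no pointer aliasing
    return (float)tmp;                     // hardware half -> float
}

static inline ggml_fp16_t fp32_to_fp16(float f) {
    __fp16 tmp = f;                        // hardware float -> half
    ggml_fp16_t res;
    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
    return res;
}

int main(void) {
    // every value in this range is exactly representable in FP16,
    // so the round trip must be lossless
    for (float f = -8.0f; f <= 8.0f; f += 0.25f) {
        assert(fp16_to_fp32(fp32_to_fp16(f)) == f);
    }
    return 0;
}
```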
ggml-quants.c
CHANGED
```diff
@@ -5654,8 +5654,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -5804,8 +5804,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
-        const float dmin = -y[i].d * (float)x[i].dmin;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -6458,7 +6458,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
         const uint8x16_t htmp = vcombine_u8(hbits, vshr_n_u8(hbits, 1));
         q3h.val[0] = vandq_u8(mh, vshlq_n_u8(htmp, 2));
@@ -6660,7 +6660,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
 
         vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
 
@@ -7163,9 +7163,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         aux16[1] = (a[0] >> 4) & 0x0f0f;
 
         const int32_t summi = scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]);
-        sum_mins += y[i].d * (float)x[i].d[1] * summi;
+        sum_mins += y[i].d * GGML_FP16_TO_FP32(x[i].d[1]) * summi;
 
-        const float d = y[i].d * (float)x[i].d[0];
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d[0]);
 
         const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
 
@@ -7823,7 +7823,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;
 
         const uint8_t * restrict q5 = x[i].qs;
@@ -7965,7 +7965,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d = y[i].d * (float)x[i].d;
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const int8_t * sc = x[i].scales;
 
         const uint8_t * restrict q5 = x[i].qs;
@@ -8533,7 +8533,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
 
         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -8704,7 +8704,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     for (int i = 0; i < nb; ++i) {
 
-        const float d_all = (float)x[i].d;
+        const float d_all = GGML_FP16_TO_FP32(x[i].d);
 
         const uint8_t * restrict q6 = x[i].ql;
         const uint8_t * restrict qh = x[i].qh;
@@ -9523,7 +9523,6 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
     float sumf = 0;
 
     for (int ib = 0; ib < nb; ib += 2) {
-
        q4bits.val[0] = vld1q_u8(x[ib+0].qs);
        q4bits.val[1] = vld1q_u8(x[ib+1].qs);
        q8b.val[0]    = vld1q_s8(y[ib+0].qs);
@@ -9539,8 +9538,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
         prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
 
-        sumf += (float)x[ib+0].d * (float)y[ib+0].d * vaddvq_s32(prod_1) +
-                (float)x[ib+1].d * (float)y[ib+1].d * vaddvq_s32(prod_2);
+        sumf +=
+            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib+0].d) * vaddvq_s32(prod_1) +
+            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib+1].d) * vaddvq_s32(prod_2);
     }
 
     *s = sumf;
```
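Every hunk above fixes the same class of bug: once `ggml_fp16_t` is a `uint16_t`, a direct cast such as `(float)x[i].d` converts the integer value of the bit pattern instead of decoding the half-precision number it encodes, inflating the scale factors by orders of magnitude. A standalone demonstration (the values are illustrative, not patch code):

```c
#include <stdint.h>
#include <stdio.h>

typedef uint16_t ggml_fp16_t;

int main(void) {
    ggml_fp16_t d = 0x3C00;   // the FP16 bit pattern that encodes 1.0
    // the old code's cast now performs a plain integer conversion:
    printf("(float)d             = %g\n", (float)d); // 15360
    // the fixed code decodes the bits via GGML_FP16_TO_FP32(d) instead,
    // yielding the intended scale:
    printf("GGML_FP16_TO_FP32(d) = %g\n", 1.0f);     // 1 (expected result)
    return 0;
}
```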
ggml.c
CHANGED
```diff
@@ -323,7 +323,7 @@ float ggml_table_f32_f16[1 << 16];
 // note: do not use these inside ggml.c
 // these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
+    return GGML_FP16_TO_FP32(x);
 }
 
 ggml_fp16_t ggml_fp32_to_fp16(float x) {
@@ -798,7 +798,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16x8              float16x8_t
 #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
 #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-#define GGML_F16x8_LOAD         vld1q_f16
+#define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
 #define GGML_F16x8_STORE        vst1q_f16
 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
 #define GGML_F16x8_ADD          vaddq_f16
@@ -841,7 +841,7 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F32Cx4              float32x4_t
 #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
 #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-#define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
+#define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
 #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
 #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
 #define GGML_F32Cx4_ADD          vaddq_f32
```
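The two LOAD macros change because their argument is now a `ggml_fp16_t *` (that is, `uint16_t *`), while `vld1q_f16`/`vld1_f16` expect `__fp16` elements, so the pointer is reinterpreted at the load site. A standalone sketch of the same pattern (AArch64 only; the wrapper name is illustrative, not from the patch):

```c
#include <arm_neon.h>
#include <stdint.h>

typedef uint16_t ggml_fp16_t;

// mirrors GGML_F16x8_LOAD after this commit: the buffer holds raw 16-bit
// patterns, so reinterpret the pointer before the vector load
static inline float16x8_t load_fp16x8(const ggml_fp16_t * x) {
    return vld1q_f16((const __fp16 *)(x));
}
```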
ggml.h
CHANGED
```diff
@@ -315,13 +315,7 @@
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-typedef __fp16 ggml_fp16_t;
-#else
 typedef uint16_t ggml_fp16_t;
-#endif
 
 // convert FP16 <-> FP32
 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
```