ggml, ci : Windows ARM runner and build fixes (llama/5979)

Committed by Michael Podvitskiy

* windows arm ci
* fix `error C2078: too many initializers` with ggml_vld1q_u32 macro for MSVC ARM64 (a portable sketch of that helper follows the ggml-quants.c diff below)
* fix `warning C4146: unary minus operator applied to unsigned type, result still unsigned` (those hunks are not shown in this view; see the sketch after the file summary)
* fix `error C2065: '__fp16': undeclared identifier` (see the note after the ggml-impl.h diff)

Files changed:
- ggml-impl.h   +6 -2
- ggml-quants.c +8 -8
- ggml.c        +2 -2
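The C4146 hunks are not part of this diff view. As a hedged illustration only (the function below is invented, not ggml code), the warning and its usual fix look like this:

```c
#include <stdint.h>

/* MSVC emits C4146 when unary minus is applied to an unsigned operand,
 * because the result stays unsigned (modular negation). */
static uint32_t sign_mask_warns(uint32_t x) {
    return -(x >> 31);       /* warning C4146 under MSVC */
}

/* Common fix: spell the modular negation explicitly as 0u - x,
 * which yields the same value without the warning. */
static uint32_t sign_mask_fixed(uint32_t x) {
    return 0u - (x >> 31);
}
```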
ggml-impl.h
CHANGED

@@ -53,26 +53,30 @@ extern "C" {
 //
 #include <arm_neon.h>
 
+typedef __fp16 ggml_fp16_internal_t;
+
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    __fp16 tmp;
+    ggml_fp16_internal_t tmp;
     memcpy(&tmp, &h, sizeof(ggml_fp16_t));
     return (float)tmp;
 }
 
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
     ggml_fp16_t res;
-    __fp16 tmp = f;
+    ggml_fp16_internal_t tmp = f;
     memcpy(&res, &tmp, sizeof(ggml_fp16_t));
     return res;
 }
 
 #else
 
+typedef uint16_t ggml_fp16_internal_t;
+
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
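The `ggml_fp16_internal_t` indirection is what fixes `error C2065: '__fp16': undeclared identifier`: GCC/Clang on AArch64 provide the native `__fp16` half type, MSVC does not, so the typedef picks `__fp16` on the NEON path and falls back to `uint16_t` elsewhere. A minimal self-contained sketch of the native path, assuming an AArch64 GCC/Clang toolchain (`fp16_to_f32`/`f32_to_fp16` are illustrative names, not the ggml API):

```c
#include <stdint.h>
#include <string.h>

typedef uint16_t ggml_fp16_t;          /* public half-precision handle, as in ggml.h */
typedef __fp16   ggml_fp16_internal_t; /* native half type on the NEON path          */

static inline float fp16_to_f32(ggml_fp16_t h) {
    ggml_fp16_internal_t tmp;
    memcpy(&tmp, &h, sizeof(ggml_fp16_t)); /* bit copy avoids strict-aliasing UB */
    return (float)tmp;                     /* hardware f16 -> f32 conversion     */
}

static inline ggml_fp16_t f32_to_fp16(float f) {
    ggml_fp16_internal_t tmp = (ggml_fp16_internal_t)f; /* f32 -> f16 */
    ggml_fp16_t res;
    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
    return res;
}
```

On the `#else` branch the typedef is plain `uint16_t`, and the actual conversions are handled by the scalar routines elsewhere in ggml.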
ggml-quants.c
CHANGED

@@ -9374,15 +9374,15 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
 
         const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
         idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
-        const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
-                                      iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
-        const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
-                                      iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
+        const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                                    iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+        const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                                    iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
         idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
-        const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
-                                      iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
-        const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
-                                      iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
+        const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
+                                                    iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
+        const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
+                                                    iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
 
 
         vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
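Why the brace initializers had to be replaced: under MSVC ARM64, NEON vector types such as `uint32x4_t` are opaque wrapper structs rather than compiler-builtin vectors, so `{a, b, c, d}` fails with `error C2078: too many initializers`. The definition of `ggml_vld1q_u32` is not shown in this view; a hedged sketch of one portable way to express the same four-lane gather (the inline-function form and `_sketch` name are assumptions, the real macro may differ):

```c
#include <arm_neon.h>
#include <stdint.h>

/* Portable stand-in for the ggml_vld1q_u32 macro used above: build the
 * vector via vld1q_u32 from a temporary array instead of brace
 * initialization, which MSVC ARM64 rejects for NEON types. */
static inline uint32x4_t ggml_vld1q_u32_sketch(uint32_t w, uint32_t x,
                                               uint32_t y, uint32_t z) {
    const uint32_t tmp[4] = { w, x, y, z };
    return vld1q_u32(tmp);
}
```

With such a helper, each `aux32x4_*` load above gathers four 32-bit entries of `iq3s_grid` selected by the unpacked indices.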
ggml.c
CHANGED

@@ -857,7 +857,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define GGML_F16x8              float16x8_t
     #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
     #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
+    #define GGML_F16x8_LOAD(x)      vld1q_f16((const ggml_fp16_internal_t *)(x))
     #define GGML_F16x8_STORE        vst1q_f16
     #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
     #define GGML_F16x8_ADD          vaddq_f16

@@ -900,7 +900,7 @@ inline static float vaddvq_f32(float32x4_t v) {
     #define GGML_F32Cx4              float32x4_t
     #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
     #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
     #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
     #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
     #define GGML_F32Cx4_ADD          vaddq_f32
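The only change in these macros is the pointer cast: `vld1q_f16`/`vld1_f16` expect a pointer to the compiler's half type, and ggml stores F16 data as `ggml_fp16_t`, so the cast now goes through `ggml_fp16_internal_t` instead of the MSVC-unknown `__fp16`. A hedged usage sketch, assuming GCC/Clang on AArch64 (`sum4_f16` is an invented example, not ggml code):

```c
#include <arm_neon.h>
#include <stdint.h>

typedef uint16_t ggml_fp16_t;
typedef __fp16   ggml_fp16_internal_t;

#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))

/* Load four F16 values, widen them to F32, and horizontally add,
 * using the same load-and-convert pattern as GGML_F32Cx4_LOAD. */
static float sum4_f16(const ggml_fp16_t * p) {
    float32x4_t v = GGML_F32Cx4_LOAD(p);
    return vaddvq_f32(v);
}
```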