Michael Podvitskiy commited on
Commit
507b9dd
·
unverified ·
1 Parent(s): f3a62cc

ggml, ci : Windows ARM runner and build fixes (llama/5979)

Browse files

* windows arm ci

* fix `error C2078: too many initializers` with ggml_vld1q_u32 macro for MSVC ARM64

* fix `warning C4146: unary minus operator applied to unsigned type, result still unsigned`

* fix `error C2065: '__fp16': undeclared identifier`

Files changed (3) hide show
  1. ggml-impl.h +6 -2
  2. ggml-quants.c +8 -8
  3. ggml.c +2 -2
ggml-impl.h CHANGED
@@ -53,26 +53,30 @@ extern "C" {
53
  //
54
  #include <arm_neon.h>
55
 
 
 
56
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
57
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
58
 
59
  #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
60
 
61
  static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
62
- __fp16 tmp;
63
  memcpy(&tmp, &h, sizeof(ggml_fp16_t));
64
  return (float)tmp;
65
  }
66
 
67
  static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
68
  ggml_fp16_t res;
69
- __fp16 tmp = f;
70
  memcpy(&res, &tmp, sizeof(ggml_fp16_t));
71
  return res;
72
  }
73
 
74
  #else
75
 
 
 
76
  #ifdef __wasm_simd128__
77
  #include <wasm_simd128.h>
78
  #else
 
53
  //
54
  #include <arm_neon.h>
55
 
56
+ typedef __fp16 ggml_fp16_internal_t;
57
+
58
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
59
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
60
 
61
  #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
62
 
63
  static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
64
+ ggml_fp16_internal_t tmp;
65
  memcpy(&tmp, &h, sizeof(ggml_fp16_t));
66
  return (float)tmp;
67
  }
68
 
69
  static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
70
  ggml_fp16_t res;
71
+ ggml_fp16_internal_t tmp = f;
72
  memcpy(&res, &tmp, sizeof(ggml_fp16_t));
73
  return res;
74
  }
75
 
76
  #else
77
 
78
+ typedef uint16_t ggml_fp16_internal_t;
79
+
80
  #ifdef __wasm_simd128__
81
  #include <wasm_simd128.h>
82
  #else
ggml-quants.c CHANGED
@@ -9374,15 +9374,15 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9374
 
9375
  const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
9376
  idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
9377
- const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9378
- iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
9379
- const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9380
- iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
9381
  idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
9382
- const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9383
- iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]};
9384
- const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9385
- iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]};
9386
 
9387
 
9388
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
 
9374
 
9375
  const uint8x16_t idx_l = vld1q_u8(qs); qs += 16;
9376
  idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256));
9377
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9378
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
9379
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9380
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
9381
  idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256));
9382
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]],
9383
+ iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]);
9384
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]],
9385
+ iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]);
9386
 
9387
 
9388
  vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16)));
ggml.c CHANGED
@@ -857,7 +857,7 @@ inline static float vaddvq_f32(float32x4_t v) {
857
  #define GGML_F16x8 float16x8_t
858
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
859
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
860
- #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
861
  #define GGML_F16x8_STORE vst1q_f16
862
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
863
  #define GGML_F16x8_ADD vaddq_f16
@@ -900,7 +900,7 @@ inline static float vaddvq_f32(float32x4_t v) {
900
  #define GGML_F32Cx4 float32x4_t
901
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
902
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
903
- #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
904
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
905
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
906
  #define GGML_F32Cx4_ADD vaddq_f32
 
857
  #define GGML_F16x8 float16x8_t
858
  #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
859
  #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
860
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
861
  #define GGML_F16x8_STORE vst1q_f16
862
  #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
863
  #define GGML_F16x8_ADD vaddq_f16
 
900
  #define GGML_F32Cx4 float32x4_t
901
  #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
902
  #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
903
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
904
  #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
905
  #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
906
  #define GGML_F32Cx4_ADD vaddq_f32