cmdr2, Diego Devesa committed
Commit 0754d43 · 1 parent: 6688a2b

cpu: move all the operators into a separate c++ file (except mul_mat) (ggml/1167)


* cpu: refactor SIMD mappings and vectorized op functions into separate files

* Fix warning for the ggml_float to float conversion

* Fix warnings

* cpu: move all the operations (except mul_mat) to a separate c++ file

* fix whitespace

* Update ggml/src/ggml-cpu/vec.h

Co-authored-by: Diego Devesa <[email protected]>

* Address PR review comments: use GGML_UNUSED, use cassert in ops.cpp

* Reverse the include order of ops.h and vec.h to match what was previously in ggml-cpu.c

---------

Co-authored-by: Diego Devesa <[email protected]>
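For orientation, the end state of this refactor is that ggml-cpu.c keeps only the scheduling/dispatch logic (and mul_mat), while the per-operator kernels live in ops.cpp behind the C-linkage declarations in ops.h, and the vectorized primitives live in vec.h/vec.cpp on top of the architecture macros in simd-mappings.h. The sketch below is illustrative only — the real dispatch switch is in the ggml-cpu.c diff, which is too large to render further down; the enum values and function names follow upstream ggml, but the body is a hypothetical reduction:

#include "ops.h"   // per-operator forward functions (C linkage, implemented in ops.cpp)
#include "vec.h"   // vectorized helpers, built on the SIMD macros from simd-mappings.h

static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    switch (tensor->op) {
        case GGML_OP_DUP: ggml_compute_forward_dup(params, tensor); break;
        case GGML_OP_ADD: ggml_compute_forward_add(params, tensor); break;
        // ... one case per operator declared in ops.h; mul_mat stays in ggml-cpu.c ...
        default: break;
    }
}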

ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -28,6 +28,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  ggml-cpu/binary-ops.cpp
  ggml-cpu/unary-ops.h
  ggml-cpu/unary-ops.cpp
+ ggml-cpu/simd-mappings.h
+ ggml-cpu/vec.h
+ ggml-cpu/vec.cpp
+ ggml-cpu/ops.h
+ ggml-cpu/ops.cpp
  )

  target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
ggml/src/ggml-cpu/ggml-cpu.c CHANGED
The diff for this file is too large to render. See raw diff
 
ggml/src/ggml-cpu/ops.cpp ADDED
The diff for this file is too large to render. See raw diff
 
ggml/src/ggml-cpu/ops.h ADDED
@@ -0,0 +1,128 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+
5
+ //
6
+ // cache line
7
+ //
8
+
9
+ #if defined(__cpp_lib_hardware_interference_size)
10
+ #define CACHE_LINE_SIZE std::hardware_destructive_interference_size
11
+ #else
12
+ #if defined(__POWER9_VECTOR__)
13
+ #define CACHE_LINE_SIZE 128
14
+ #elif defined(__VXE__) || defined(__VXE2__)
15
+ #define CACHE_LINE_SIZE 256
16
+ #else
17
+ #define CACHE_LINE_SIZE 64
18
+ #endif
19
+ #endif
20
+
21
+ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
28
+ void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
29
+ void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
30
+ void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
31
+ void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
32
+ void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
33
+ void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
34
+ void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
35
+ void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
36
+ void ggml_compute_forward_repeat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
37
+ void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
38
+ void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
39
+ void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
40
+ void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
41
+ void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
42
+ void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
43
+ void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
44
+ void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
45
+ void ggml_compute_forward_out_prod(const struct ggml_compute_params * params, struct ggml_tensor * dst);
46
+ void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
47
+ void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
48
+ void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
49
+ void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
50
+ void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
51
+ void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
52
+ void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
53
+ void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
54
+ void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
55
+ void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
56
+ void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
57
+ void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
58
+ void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
59
+ void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
60
+ void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
61
+ void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
62
+ void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
63
+ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
64
+ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
65
+ void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
66
+ void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
67
+ void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
68
+ void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
69
+ void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
70
+ void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
71
+ void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
72
+ void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
73
+ void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
74
+ void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
75
+ void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
76
+ void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
77
+ void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
78
+ void ggml_compute_forward_flash_attn_ext(
79
+ const struct ggml_compute_params * params,
80
+ const struct ggml_tensor * q,
81
+ const struct ggml_tensor * k,
82
+ const struct ggml_tensor * v,
83
+ const struct ggml_tensor * mask,
84
+ struct ggml_tensor * dst);
85
+ void ggml_compute_forward_flash_attn_back(
86
+ const struct ggml_compute_params * params,
87
+ const bool masked,
88
+ struct ggml_tensor * dst);
89
+ void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst);
90
+ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
91
+ void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
92
+ void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
93
+ void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
94
+ void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
95
+ void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
96
+ void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
97
+ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
98
+ void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
99
+ void ggml_compute_forward_map_unary(
100
+ const struct ggml_compute_params * params,
101
+ struct ggml_tensor * dst,
102
+ const ggml_unary_op_f32_t fun);
103
+ void ggml_compute_forward_map_binary(
104
+ const struct ggml_compute_params * params,
105
+ struct ggml_tensor * dst,
106
+ const ggml_binary_op_f32_t fun);
107
+ void ggml_compute_forward_map_custom1_f32(
108
+ const struct ggml_compute_params * params,
109
+ struct ggml_tensor * dst,
110
+ const ggml_custom1_op_f32_t fun);
111
+ void ggml_compute_forward_map_custom2_f32(
112
+ const struct ggml_compute_params * params,
113
+ struct ggml_tensor * dst,
114
+ const ggml_custom2_op_f32_t fun);
115
+ void ggml_compute_forward_map_custom3_f32(
116
+ const struct ggml_compute_params * params,
117
+ struct ggml_tensor * dst,
118
+ const ggml_custom3_op_f32_t fun);
119
+ void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
120
+ void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
121
+ void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
122
+ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
123
+ void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
124
+ void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
125
+
126
+ #ifdef __cplusplus
127
+ }
128
+ #endif
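Because ops.h wraps its declarations in extern "C", ops.cpp can be compiled as C++ while ggml-cpu.c keeps calling the operators as plain C functions. A hypothetical skeleton of how one of the declarations above is typically backed in ops.cpp (the real file's diff is too large to render; the static helper name and the per-type switch shown here are illustrative, not copied from the commit):

#include "ops.h"

#include <cassert>

// per-type kernel; parallelized across rows via params->ith / params->nth
static void ggml_compute_forward_scale_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
    // ... row loop over dst, each thread handling the rows assigned to params->ith ...
    (void) params; (void) dst;
}

void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            ggml_compute_forward_scale_f32(params, dst);
            break;
        default:
            assert(false && "unsupported type");
            break;
    }
}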
ggml/src/ggml-cpu/simd-mappings.h ADDED
@@ -0,0 +1,884 @@
1
+ #pragma once
2
+
3
+ #include "ggml-cpu-impl.h"
4
+
5
+ //
6
+ // simd mappings
7
+ //
8
+
9
+ // we define a common set of C macros which map to specific intrinsics based on the current architecture
10
+ // we then implement the fundamental computation operations below using only these macros
11
+ // adding support for new architectures requires to define the corresponding SIMD macros
12
+ //
13
+ // GGML_F32_STEP / GGML_F16_STEP
14
+ // number of elements to process in a single step
15
+ //
16
+ // GGML_F32_EPR / GGML_F16_EPR
17
+ // number of elements to fit in a single register
18
+ //
19
+
20
+ #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
21
+
22
+ #define GGML_SIMD
23
+
24
+ // F32 NEON
25
+
26
+ #define GGML_F32_STEP 16
27
+ #define GGML_F32_EPR 4
28
+
29
+ #define GGML_F32x4 float32x4_t
30
+ #define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
31
+ #define GGML_F32x4_SET1(x) vdupq_n_f32(x)
32
+ #define GGML_F32x4_LOAD vld1q_f32
33
+ #define GGML_F32x4_STORE vst1q_f32
34
+ #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
35
+ #define GGML_F32x4_ADD vaddq_f32
36
+ #define GGML_F32x4_MUL vmulq_f32
37
+ #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
38
+ #define GGML_F32x4_REDUCE(res, x) \
39
+ { \
40
+ int offset = GGML_F32_ARR >> 1; \
41
+ for (int i = 0; i < offset; ++i) { \
42
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
43
+ } \
44
+ offset >>= 1; \
45
+ for (int i = 0; i < offset; ++i) { \
46
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
47
+ } \
48
+ offset >>= 1; \
49
+ for (int i = 0; i < offset; ++i) { \
50
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
51
+ } \
52
+ (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
53
+ }
54
+
55
+ #define GGML_F32_VEC GGML_F32x4
56
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
57
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
58
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
59
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
60
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
61
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
62
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
63
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
64
+
65
+ // F16 NEON
66
+
67
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
68
+ #define GGML_F16_STEP 32
69
+ #define GGML_F16_EPR 8
70
+
71
+ #define GGML_F16x8 float16x8_t
72
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
73
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
74
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
75
+ #define GGML_F16x8_STORE vst1q_f16
76
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
77
+ #define GGML_F16x8_ADD vaddq_f16
78
+ #define GGML_F16x8_MUL vmulq_f16
79
+ #define GGML_F16x8_REDUCE(res, x) \
80
+ do { \
81
+ int offset = GGML_F16_ARR >> 1; \
82
+ for (int i = 0; i < offset; ++i) { \
83
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
84
+ } \
85
+ offset >>= 1; \
86
+ for (int i = 0; i < offset; ++i) { \
87
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
88
+ } \
89
+ offset >>= 1; \
90
+ for (int i = 0; i < offset; ++i) { \
91
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
92
+ } \
93
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
94
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
95
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
96
+ } while (0)
97
+
98
+ #define GGML_F16_VEC GGML_F16x8
99
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
100
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
101
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
102
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
103
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
104
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
105
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
106
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
107
+ #else
108
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
109
+ // and take advantage of the vcvt_ functions to convert to/from FP16
110
+
111
+ #define GGML_F16_STEP 16
112
+ #define GGML_F16_EPR 4
113
+
114
+ #define GGML_F32Cx4 float32x4_t
115
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
116
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
117
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
118
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
119
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
120
+ #define GGML_F32Cx4_ADD vaddq_f32
121
+ #define GGML_F32Cx4_MUL vmulq_f32
122
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
123
+
124
+ #define GGML_F16_VEC GGML_F32Cx4
125
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
126
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
127
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
128
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
129
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
130
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
131
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
132
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
133
+ #endif
134
+
135
+ #elif defined(__AVX512F__)
136
+
137
+ #define GGML_SIMD
138
+
139
+ // F32 AVX512
140
+
141
+ #define GGML_F32_STEP 64
142
+ #define GGML_F32_EPR 16
143
+
144
+ #define GGML_F32x16 __m512
145
+ #define GGML_F32x16_ZERO _mm512_setzero_ps()
146
+ #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
147
+ #define GGML_F32x16_LOAD _mm512_loadu_ps
148
+ #define GGML_F32x16_STORE _mm512_storeu_ps
149
+ // _mm512_fmadd_ps is defined in AVX512F so no guard is required
150
+ #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
151
+ #define GGML_F32x16_ADD _mm512_add_ps
152
+ #define GGML_F32x16_MUL _mm512_mul_ps
153
+ #define GGML_F32x16_REDUCE(res, x) \
154
+ do { \
155
+ int offset = GGML_F32_ARR >> 1; \
156
+ for (int i = 0; i < offset; ++i) { \
157
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
158
+ } \
159
+ offset >>= 1; \
160
+ for (int i = 0; i < offset; ++i) { \
161
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
162
+ } \
163
+ offset >>= 1; \
164
+ for (int i = 0; i < offset; ++i) { \
165
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
166
+ } \
167
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
168
+ } while (0)
169
+
170
+ // TODO: is this optimal ?
171
+
172
+ #define GGML_F32_VEC GGML_F32x16
173
+ #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
174
+ #define GGML_F32_VEC_SET1 GGML_F32x16_SET1
175
+ #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
176
+ #define GGML_F32_VEC_STORE GGML_F32x16_STORE
177
+ #define GGML_F32_VEC_FMA GGML_F32x16_FMA
178
+ #define GGML_F32_VEC_ADD GGML_F32x16_ADD
179
+ #define GGML_F32_VEC_MUL GGML_F32x16_MUL
180
+ #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
181
+
182
+ // F16 AVX512
183
+
184
+ // F16 AVX
185
+
186
+ #define GGML_F16_STEP 64
187
+ #define GGML_F16_EPR 16
188
+
189
+ // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
190
+
191
+ #define GGML_F32Cx16 __m512
192
+ #define GGML_F32Cx16_ZERO _mm512_setzero_ps()
193
+ #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
194
+
195
+ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
196
+ // so F16C guard isn't required
197
+ #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
198
+ #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
199
+
200
+ #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
201
+ #define GGML_F32Cx16_ADD _mm512_add_ps
202
+ #define GGML_F32Cx16_MUL _mm512_mul_ps
203
+ #define GGML_F32Cx16_REDUCE(res, x) \
204
+ do { \
205
+ int offset = GGML_F32_ARR >> 1; \
206
+ for (int i = 0; i < offset; ++i) { \
207
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
208
+ } \
209
+ offset >>= 1; \
210
+ for (int i = 0; i < offset; ++i) { \
211
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
212
+ } \
213
+ offset >>= 1; \
214
+ for (int i = 0; i < offset; ++i) { \
215
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
216
+ } \
217
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
218
+ } while (0)
219
+
220
+ #define GGML_F16_VEC GGML_F32Cx16
221
+ #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
222
+ #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
223
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
224
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
225
+ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
226
+ #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
227
+ #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
228
+
229
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
230
+ #elif defined(__AVX__)
231
+
232
+ #define GGML_SIMD
233
+
234
+ // F32 AVX
235
+
236
+ #define GGML_F32_STEP 32
237
+ #define GGML_F32_EPR 8
238
+
239
+ #define GGML_F32x8 __m256
240
+ #define GGML_F32x8_ZERO _mm256_setzero_ps()
241
+ #define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
242
+ #define GGML_F32x8_LOAD _mm256_loadu_ps
243
+ #define GGML_F32x8_STORE _mm256_storeu_ps
244
+ #if defined(__FMA__)
245
+ #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
246
+ #else
247
+ #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
248
+ #endif
249
+ #define GGML_F32x8_ADD _mm256_add_ps
250
+ #define GGML_F32x8_MUL _mm256_mul_ps
251
+ #define GGML_F32x8_REDUCE(res, x) \
252
+ do { \
253
+ int offset = GGML_F32_ARR >> 1; \
254
+ for (int i = 0; i < offset; ++i) { \
255
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
256
+ } \
257
+ offset >>= 1; \
258
+ for (int i = 0; i < offset; ++i) { \
259
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
260
+ } \
261
+ offset >>= 1; \
262
+ for (int i = 0; i < offset; ++i) { \
263
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
264
+ } \
265
+ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
266
+ _mm256_extractf128_ps(x[0], 1)); \
267
+ const __m128 t1 = _mm_hadd_ps(t0, t0); \
268
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
269
+ } while (0)
270
+ // TODO: is this optimal ?
271
+
272
+ #define GGML_F32_VEC GGML_F32x8
273
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
274
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
275
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
276
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
277
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
278
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
279
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
280
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
281
+
282
+ // F16 AVX
283
+
284
+ #define GGML_F16_STEP 32
285
+ #define GGML_F16_EPR 8
286
+
287
+ // F16 arithmetic is not supported by AVX, so we use F32 instead
288
+
289
+ #define GGML_F32Cx8 __m256
290
+ #define GGML_F32Cx8_ZERO _mm256_setzero_ps()
291
+ #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
292
+
293
+ #if defined(__F16C__)
294
+ // the _mm256_cvt intrinsics require F16C
295
+ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
296
+ #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
297
+ #else
298
+ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
299
+ float tmp[8];
300
+
301
+ for (int i = 0; i < 8; i++) {
302
+ tmp[i] = GGML_FP16_TO_FP32(x[i]);
303
+ }
304
+
305
+ return _mm256_loadu_ps(tmp);
306
+ }
307
+ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
308
+ float arr[8];
309
+
310
+ _mm256_storeu_ps(arr, y);
311
+
312
+ for (int i = 0; i < 8; i++)
313
+ x[i] = GGML_FP32_TO_FP16(arr[i]);
314
+ }
315
+ #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
316
+ #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
317
+ #endif
318
+
319
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
320
+ #define GGML_F32Cx8_ADD _mm256_add_ps
321
+ #define GGML_F32Cx8_MUL _mm256_mul_ps
322
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
323
+
324
+ #define GGML_F16_VEC GGML_F32Cx8
325
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
326
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
327
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
328
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
329
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
330
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
331
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
332
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
333
+
334
+ #elif defined(__POWER9_VECTOR__)
335
+
336
+ #define GGML_SIMD
337
+
338
+ // F32 POWER9
339
+
340
+ #define GGML_F32_STEP 32
341
+ #define GGML_F32_EPR 4
342
+
343
+ #define GGML_F32x4 vector float
344
+ #define GGML_F32x4_ZERO 0.0f
345
+ #define GGML_F32x4_SET1 vec_splats
346
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
347
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
348
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
349
+ #define GGML_F32x4_ADD vec_add
350
+ #define GGML_F32x4_MUL vec_mul
351
+ #define GGML_F32x4_REDUCE(res, x) \
352
+ { \
353
+ int offset = GGML_F32_ARR >> 1; \
354
+ for (int i = 0; i < offset; ++i) { \
355
+ x[i] = vec_add(x[i], x[offset+i]); \
356
+ } \
357
+ offset >>= 1; \
358
+ for (int i = 0; i < offset; ++i) { \
359
+ x[i] = vec_add(x[i], x[offset+i]); \
360
+ } \
361
+ offset >>= 1; \
362
+ for (int i = 0; i < offset; ++i) { \
363
+ x[i] = vec_add(x[i], x[offset+i]); \
364
+ } \
365
+ res = vec_extract(x[0], 0) + \
366
+ vec_extract(x[0], 1) + \
367
+ vec_extract(x[0], 2) + \
368
+ vec_extract(x[0], 3); \
369
+ }
370
+
371
+ #define GGML_F32_VEC GGML_F32x4
372
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
373
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
374
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
375
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
376
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
377
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
378
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
379
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
380
+
381
+ // F16 POWER9
382
+ #define GGML_F16_STEP GGML_F32_STEP
383
+ #define GGML_F16_EPR GGML_F32_EPR
384
+ #define GGML_F16_VEC GGML_F32x4
385
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
386
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
387
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
388
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
389
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
390
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
391
+ // Use vec_xl, not vec_ld, in case the load address is not aligned.
392
+ #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
393
+ vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
394
+ vec_extract_fp32_from_shortl(vec_xl(0, p))
395
+ #define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
396
+ #define GGML_F16_VEC_STORE(p, r, i) \
397
+ if (i & 0x1) \
398
+ vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
399
+ r[i - GGML_ENDIAN_BYTE(0)]), \
400
+ 0, p - GGML_F16_EPR)
401
+
402
+ #elif defined(__wasm_simd128__)
403
+
404
+ #define GGML_SIMD
405
+
406
+ // F32 WASM
407
+
408
+ #define GGML_F32_STEP 16
409
+ #define GGML_F32_EPR 4
410
+
411
+ #define GGML_F32x4 v128_t
412
+ #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
413
+ #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
414
+ #define GGML_F32x4_LOAD wasm_v128_load
415
+ #define GGML_F32x4_STORE wasm_v128_store
416
+ #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
417
+ #define GGML_F32x4_ADD wasm_f32x4_add
418
+ #define GGML_F32x4_MUL wasm_f32x4_mul
419
+ #define GGML_F32x4_REDUCE(res, x) \
420
+ { \
421
+ int offset = GGML_F32_ARR >> 1; \
422
+ for (int i = 0; i < offset; ++i) { \
423
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
424
+ } \
425
+ offset >>= 1; \
426
+ for (int i = 0; i < offset; ++i) { \
427
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
428
+ } \
429
+ offset >>= 1; \
430
+ for (int i = 0; i < offset; ++i) { \
431
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
432
+ } \
433
+ res = wasm_f32x4_extract_lane(x[0], 0) + \
434
+ wasm_f32x4_extract_lane(x[0], 1) + \
435
+ wasm_f32x4_extract_lane(x[0], 2) + \
436
+ wasm_f32x4_extract_lane(x[0], 3); \
437
+ }
438
+
439
+ #define GGML_F32_VEC GGML_F32x4
440
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
441
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
442
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
443
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
444
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
445
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
446
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
447
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
448
+
449
+ // F16 WASM
450
+
451
+ #define GGML_F16_STEP 16
452
+ #define GGML_F16_EPR 4
453
+
454
+ inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
455
+ float tmp[4];
456
+
457
+ tmp[0] = GGML_FP16_TO_FP32(p[0]);
458
+ tmp[1] = GGML_FP16_TO_FP32(p[1]);
459
+ tmp[2] = GGML_FP16_TO_FP32(p[2]);
460
+ tmp[3] = GGML_FP16_TO_FP32(p[3]);
461
+
462
+ return wasm_v128_load(tmp);
463
+ }
464
+
465
+ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
466
+ float tmp[4];
467
+
468
+ wasm_v128_store(tmp, x);
469
+
470
+ p[0] = GGML_FP32_TO_FP16(tmp[0]);
471
+ p[1] = GGML_FP32_TO_FP16(tmp[1]);
472
+ p[2] = GGML_FP32_TO_FP16(tmp[2]);
473
+ p[3] = GGML_FP32_TO_FP16(tmp[3]);
474
+ }
475
+
476
+ #define GGML_F16x4 v128_t
477
+ #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
478
+ #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
479
+ #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
480
+ #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
481
+ #define GGML_F16x4_FMA GGML_F32x4_FMA
482
+ #define GGML_F16x4_ADD wasm_f32x4_add
483
+ #define GGML_F16x4_MUL wasm_f32x4_mul
484
+ #define GGML_F16x4_REDUCE(res, x) \
485
+ { \
486
+ int offset = GGML_F16_ARR >> 1; \
487
+ for (int i = 0; i < offset; ++i) { \
488
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
489
+ } \
490
+ offset >>= 1; \
491
+ for (int i = 0; i < offset; ++i) { \
492
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
493
+ } \
494
+ offset >>= 1; \
495
+ for (int i = 0; i < offset; ++i) { \
496
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
497
+ } \
498
+ res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
499
+ wasm_f32x4_extract_lane(x[0], 1) + \
500
+ wasm_f32x4_extract_lane(x[0], 2) + \
501
+ wasm_f32x4_extract_lane(x[0], 3)); \
502
+ }
503
+
504
+ #define GGML_F16_VEC GGML_F16x4
505
+ #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
506
+ #define GGML_F16_VEC_SET1 GGML_F16x4_SET1
507
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
508
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
509
+ #define GGML_F16_VEC_FMA GGML_F16x4_FMA
510
+ #define GGML_F16_VEC_ADD GGML_F16x4_ADD
511
+ #define GGML_F16_VEC_MUL GGML_F16x4_MUL
512
+ #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
513
+
514
+ #elif defined(__SSE3__)
515
+
516
+ #define GGML_SIMD
517
+
518
+ // F32 SSE
519
+
520
+ #define GGML_F32_STEP 32
521
+ #define GGML_F32_EPR 4
522
+
523
+ #define GGML_F32x4 __m128
524
+ #define GGML_F32x4_ZERO _mm_setzero_ps()
525
+ #define GGML_F32x4_SET1(x) _mm_set1_ps(x)
526
+ #define GGML_F32x4_LOAD _mm_loadu_ps
527
+ #define GGML_F32x4_STORE _mm_storeu_ps
528
+ #if defined(__FMA__)
529
+ // TODO: Does this work?
530
+ #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
531
+ #else
532
+ #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
533
+ #endif
534
+ #define GGML_F32x4_ADD _mm_add_ps
535
+ #define GGML_F32x4_MUL _mm_mul_ps
536
+ #define GGML_F32x4_REDUCE(res, x) \
537
+ { \
538
+ int offset = GGML_F32_ARR >> 1; \
539
+ for (int i = 0; i < offset; ++i) { \
540
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
541
+ } \
542
+ offset >>= 1; \
543
+ for (int i = 0; i < offset; ++i) { \
544
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
545
+ } \
546
+ offset >>= 1; \
547
+ for (int i = 0; i < offset; ++i) { \
548
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
549
+ } \
550
+ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
551
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
552
+ }
553
+ // TODO: is this optimal ?
554
+
555
+ #define GGML_F32_VEC GGML_F32x4
556
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
557
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
558
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
559
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
560
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
561
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
562
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
563
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
564
+
565
+ // F16 SSE
566
+
567
+ #define GGML_F16_STEP 32
568
+ #define GGML_F16_EPR 4
569
+
570
+ static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
571
+ float tmp[4];
572
+
573
+ tmp[0] = GGML_FP16_TO_FP32(x[0]);
574
+ tmp[1] = GGML_FP16_TO_FP32(x[1]);
575
+ tmp[2] = GGML_FP16_TO_FP32(x[2]);
576
+ tmp[3] = GGML_FP16_TO_FP32(x[3]);
577
+
578
+ return _mm_loadu_ps(tmp);
579
+ }
580
+
581
+ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
582
+ float arr[4];
583
+
584
+ _mm_storeu_ps(arr, y);
585
+
586
+ x[0] = GGML_FP32_TO_FP16(arr[0]);
587
+ x[1] = GGML_FP32_TO_FP16(arr[1]);
588
+ x[2] = GGML_FP32_TO_FP16(arr[2]);
589
+ x[3] = GGML_FP32_TO_FP16(arr[3]);
590
+ }
591
+
592
+ #define GGML_F32Cx4 __m128
593
+ #define GGML_F32Cx4_ZERO _mm_setzero_ps()
594
+ #define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
595
+ #define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
596
+ #define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
597
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
598
+ #define GGML_F32Cx4_ADD _mm_add_ps
599
+ #define GGML_F32Cx4_MUL _mm_mul_ps
600
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
601
+
602
+ #define GGML_F16_VEC GGML_F32Cx4
603
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
604
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
605
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
606
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
607
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
608
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
609
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
610
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
611
+
612
+ #elif defined(__loongarch_asx)
613
+
614
+ #define GGML_SIMD
615
+
616
+ // F32 LASX
617
+ #define GGML_F32_STEP 32
618
+ #define GGML_F32_EPR 8
619
+
620
+ #define GGML_F32x8 __m256
621
+ #define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
622
+ #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
623
+ #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
624
+ #define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
625
+ #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
626
+ #define GGML_F32x8_ADD __lasx_xvfadd_s
627
+ #define GGML_F32x8_MUL __lasx_xvfmul_s
628
+ #define GGML_F32x8_REDUCE(res, x) \
629
+ do { \
630
+ int offset = GGML_F32_ARR >> 1; \
631
+ for (int i = 0; i < offset; ++i) { \
632
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
633
+ } \
634
+ offset >>= 1; \
635
+ for (int i = 0; i < offset; ++i) { \
636
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
637
+ } \
638
+ offset >>= 1; \
639
+ for (int i = 0; i < offset; ++i) { \
640
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
641
+ } \
642
+ float *tmp_p = (float *)&x[0]; \
643
+ res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
644
+ } while (0)
645
+ // TODO: is this optimal ?
646
+
647
+ #define GGML_F32_VEC GGML_F32x8
648
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
649
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
650
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
651
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
652
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
653
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
654
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
655
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
656
+
657
+ // F16 LASX
658
+
659
+ #define GGML_F16_STEP 32
660
+ #define GGML_F16_EPR 8
661
+
662
+ // F16 arithmetic is not supported by LASX, so we use F32 instead
663
+
664
+ #define GGML_F32Cx8 __m256
665
+ #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
666
+ #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
667
+
668
+ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
669
+ __m256i a;
670
+ memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
671
+ a = __lasx_xvpermi_d(a, 0 | (1 << 4));
672
+ return __lasx_xvfcvtl_s_h(a);
673
+ }
674
+
675
+ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
676
+ __m256i a = __lasx_xvfcvt_h_s(y, y);
677
+ a = __lasx_xvpermi_d(a, 0 | (2 << 2));
678
+ memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
679
+ }
680
+ #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
681
+ #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
682
+
683
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
684
+ #define GGML_F32Cx8_ADD __lasx_xvfadd_s
685
+ #define GGML_F32Cx8_MUL __lasx_xvfmul_s
686
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
687
+
688
+ #define GGML_F16_VEC GGML_F32Cx8
689
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
690
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
691
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
692
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
693
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
694
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
695
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
696
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
697
+
698
+ #elif defined(__loongarch_sx)
699
+
700
+ #define GGML_SIMD
701
+
702
+ // F32 LSX
703
+
704
+ #define GGML_F32_STEP 32
705
+ #define GGML_F32_EPR 4
706
+
707
+ #define GGML_F32x4 __m128
708
+ #define GGML_F32x4_ZERO __lsx_vldi(0)
709
+ #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
710
+ #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
711
+ #define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
712
+ #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
713
+ #define GGML_F32x4_ADD __lsx_vfadd_s
714
+ #define GGML_F32x4_MUL __lsx_vfmul_s
715
+ #define GGML_F32x4_REDUCE(res, x) \
716
+ { \
717
+ int offset = GGML_F32_ARR >> 1; \
718
+ for (int i = 0; i < offset; ++i) { \
719
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
720
+ } \
721
+ offset >>= 1; \
722
+ for (int i = 0; i < offset; ++i) { \
723
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
724
+ } \
725
+ offset >>= 1; \
726
+ for (int i = 0; i < offset; ++i) { \
727
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
728
+ } \
729
+ __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
730
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
731
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
732
+ const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
733
+ tmp = __lsx_vsrli_d((__m128i) t0, 32); \
734
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
735
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
736
+ res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
737
+ }
738
+
739
+ #define GGML_F32_VEC GGML_F32x4
740
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
741
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
742
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
743
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
744
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
745
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
746
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
747
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
748
+
749
+ // F16 LSX
750
+
751
+ #define GGML_F16_STEP 32
752
+ #define GGML_F16_EPR 4
753
+
754
+ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
755
+ float tmp[4];
756
+
757
+ tmp[0] = GGML_FP16_TO_FP32(x[0]);
758
+ tmp[1] = GGML_FP16_TO_FP32(x[1]);
759
+ tmp[2] = GGML_FP16_TO_FP32(x[2]);
760
+ tmp[3] = GGML_FP16_TO_FP32(x[3]);
761
+
762
+ return __lsx_vld(tmp, 0);
763
+ }
764
+
765
+ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
766
+ float arr[4];
767
+
768
+ __lsx_vst(y, arr, 0);
769
+
770
+ x[0] = GGML_FP32_TO_FP16(arr[0]);
771
+ x[1] = GGML_FP32_TO_FP16(arr[1]);
772
+ x[2] = GGML_FP32_TO_FP16(arr[2]);
773
+ x[3] = GGML_FP32_TO_FP16(arr[3]);
774
+ }
775
+
776
+ #define GGML_F32Cx4 __m128
777
+ #define GGML_F32Cx4_ZERO __lsx_vldi(0)
778
+ #define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
779
+ #define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
780
+ #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
781
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
782
+ #define GGML_F32Cx4_ADD __lsx_vfadd_s
783
+ #define GGML_F32Cx4_MUL __lsx_vfmul_s
784
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
785
+
786
+ #define GGML_F16_VEC GGML_F32Cx4
787
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
788
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
789
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
790
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
791
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
792
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
793
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
794
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
795
+
796
+ #elif defined(__VXE__) || defined(__VXE2__)
797
+
798
+ #define GGML_SIMD
799
+
800
+ // F32 s390x
801
+
802
+ #define GGML_F32_STEP 32
803
+ #define GGML_F32_EPR 4
804
+
805
+ #define GGML_F32x4 __vector float
806
+ #define GGML_F32x4_ZERO vec_splats(0.0f)
807
+ #define GGML_F32x4_SET1 vec_splats
808
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
809
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
810
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
811
+ #define GGML_F32x4_ADD vec_add
812
+ #define GGML_F32x4_MUL vec_mul
813
+ #define GGML_F32x4_REDUCE(res, x) \
814
+ { \
815
+ int offset = GGML_F32_ARR >> 1; \
816
+ for (int i = 0; i < offset; ++i) { \
817
+ x[i] = vec_add(x[i], x[offset + i]); \
818
+ } \
819
+ offset >>= 1; \
820
+ for (int i = 0; i < offset; ++i) { \
821
+ x[i] = vec_add(x[i], x[offset + i]); \
822
+ } \
823
+ offset >>= 1; \
824
+ for (int i = 0; i < offset; ++i) { \
825
+ x[i] = vec_add(x[i], x[offset + i]); \
826
+ } \
827
+ res = vec_extract(x[0], 0) + \
828
+ vec_extract(x[0], 1) + \
829
+ vec_extract(x[0], 2) + \
830
+ vec_extract(x[0], 3); \
831
+ }
832
+
833
+ #define GGML_F32_VEC GGML_F32x4
834
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
835
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
836
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
837
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
838
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
839
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
840
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
841
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
842
+
843
+ // F16 s390x
844
+ #define GGML_F16_STEP GGML_F32_STEP
845
+ #define GGML_F16_EPR GGML_F32_EPR
846
+
847
+ static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
848
+ float tmp[4];
849
+
850
+ for (int i = 0; i < 4; i++) {
851
+ tmp[i] = GGML_FP16_TO_FP32(x[i]);
852
+ }
853
+
854
+ return vec_xl(0, tmp);
855
+ }
856
+
857
+ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
858
+ float arr[4];
859
+
860
+ vec_xst(y, 0, arr);
861
+
862
+ for (int i = 0; i < 4; i++) {
863
+ x[i] = GGML_FP32_TO_FP16(arr[i]);
864
+ }
865
+ }
866
+
867
+ #define GGML_F16_VEC GGML_F32x4
868
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
869
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
870
+ #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
871
+ #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
872
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
873
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
874
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
875
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
876
+
877
+ #endif
878
+
879
+ // GGML_F32_ARR / GGML_F16_ARR
880
+ // number of registers to use per step
881
+ #ifdef GGML_SIMD
882
+ #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
883
+ #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
884
+ #endif
ggml/src/ggml-cpu/vec.cpp ADDED
@@ -0,0 +1,258 @@
1
+ #include "vec.h"
2
+
3
+ #include <cassert>
4
+
5
+ #if defined(_MSC_VER)
6
+ // disable "possible loss of data" to avoid hundreds of casts
7
+ // we should just be careful :)
8
+ #pragma warning(disable: 4244 4267)
9
+ #endif
10
+
11
+ // precomputed gelu table for f16 (128 KB)
12
+ ggml_fp16_t ggml_table_gelu_f16[1 << 16];
13
+
14
+ // precomputed quick gelu table for f16 (128 KB)
15
+ ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
16
+
17
+ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
18
+ assert(nrc == 1);
19
+ GGML_UNUSED(nrc);
20
+ GGML_UNUSED(bx);
21
+ GGML_UNUSED(by);
22
+ GGML_UNUSED(bs);
23
+
24
+ #if defined(GGML_SIMD)
25
+ float sumf = 0.0f;
26
+ const int np = (n & ~(GGML_F32_STEP - 1));
27
+
28
+ GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
29
+
30
+ GGML_F32_VEC ax[GGML_F32_ARR];
31
+ GGML_F32_VEC ay[GGML_F32_ARR];
32
+
33
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
34
+ for (int j = 0; j < GGML_F32_ARR; j++) {
35
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
36
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
37
+
38
+ sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
39
+ }
40
+ }
41
+
42
+ // reduce sum0..sum3 to sum0
43
+ GGML_F32_VEC_REDUCE(sumf, sum);
44
+
45
+ // leftovers
46
+ for (int i = np; i < n; ++i) {
47
+ sumf += x[i]*y[i];
48
+ }
49
+ #else
50
+ // scalar
51
+ ggml_float sumf = 0.0;
52
+ for (int i = 0; i < n; ++i) {
53
+ sumf += (ggml_float)(x[i]*y[i]);
54
+ }
55
+ #endif
56
+
57
+ *s = sumf;
58
+ }
59
+
60
+ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
61
+ assert(nrc == 1);
62
+ GGML_UNUSED(nrc);
63
+ GGML_UNUSED(bx);
64
+ GGML_UNUSED(by);
65
+ GGML_UNUSED(bs);
66
+ int i = 0;
67
+ ggml_float sumf = 0;
68
+
69
+ #if defined(__AVX512BF16__)
70
+ __m512 c1 = _mm512_setzero_ps();
71
+ __m512 c2 = _mm512_setzero_ps();
72
+ for (; i + 64 <= n; i += 64) {
73
+ c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
74
+ m512bh(_mm512_loadu_si512((y + i))));
75
+ c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
76
+ m512bh(_mm512_loadu_si512((y + i + 32))));
77
+ }
78
+ sumf += (ggml_float)_mm512_reduce_add_ps(c1);
79
+ sumf += (ggml_float)_mm512_reduce_add_ps(c2);
80
+
81
+ #elif defined(__AVX512F__)
82
+ #define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
83
+ __m512 c1 = _mm512_setzero_ps();
84
+ __m512 c2 = _mm512_setzero_ps();
85
+ for (; i + 32 <= n; i += 32) {
86
+ c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
87
+ c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
88
+ }
89
+ sumf += (ggml_float)_mm512_reduce_add_ps(c1);
90
+ sumf += (ggml_float)_mm512_reduce_add_ps(c2);
91
+
92
+ #undef LOAD
93
+ #elif defined(__AVX2__) || defined(__AVX__)
94
+ #if defined(__AVX2__)
95
+ #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
96
+ #else
97
+ #define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
98
+ #endif
99
+ __m256 c1 = _mm256_setzero_ps();
100
+ __m256 c2 = _mm256_setzero_ps();
101
+ __m256 c3 = _mm256_setzero_ps();
102
+ __m256 c4 = _mm256_setzero_ps();
103
+ for (; i + 32 <= n; i += 32) {
104
+ c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
105
+ c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
106
+ c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
107
+ c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
108
+ }
109
+ __m128 g;
110
+ c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
111
+ _mm256_add_ps(c2, c4));
112
+ g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
113
+ _mm256_castps256_ps128(c1));
114
+ g = _mm_add_ps(g, _mm_movehl_ps(g, g));
115
+ g = _mm_add_ss(g, _mm_movehdup_ps(g));
116
+ sumf += (ggml_float)_mm_cvtss_f32(g);
117
+
118
+ #undef LOAD
119
+ #endif
120
+
121
+ for (; i < n; ++i) {
122
+ sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
123
+ GGML_BF16_TO_FP32(y[i]));
124
+ }
125
+ *s = sumf;
126
+ }
127
+
128
+ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
129
+ assert(nrc == 1);
130
+ GGML_UNUSED(nrc);
131
+ GGML_UNUSED(bx);
132
+ GGML_UNUSED(by);
133
+ GGML_UNUSED(bs);
134
+
135
+ ggml_float sumf = 0.0;
136
+
137
+ #if defined(GGML_SIMD)
138
+ const int np = (n & ~(GGML_F16_STEP - 1));
139
+
140
+ GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
141
+
142
+ GGML_F16_VEC ax[GGML_F16_ARR];
143
+ GGML_F16_VEC ay[GGML_F16_ARR];
144
+
145
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
146
+ for (int j = 0; j < GGML_F16_ARR; j++) {
147
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
148
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
149
+
150
+ sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
151
+ }
152
+ }
153
+
154
+ // reduce sum0..sum3 to sum0
155
+ GGML_F16_VEC_REDUCE(sumf, sum);
156
+
157
+ // leftovers
158
+ for (int i = np; i < n; ++i) {
159
+ sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
160
+ }
161
+ #else
162
+ for (int i = 0; i < n; ++i) {
163
+ sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
164
+ }
165
+ #endif
166
+
167
+ *s = sumf;
168
+ }
169
+
170
+ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
171
+ int i = 0;
172
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
173
+ for (; i + 15 < n; i += 16) {
174
+ _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
175
+ }
176
+ #elif defined(__AVX2__) && defined(__FMA__)
177
+ for (; i + 7 < n; i += 8) {
178
+ _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
179
+ }
180
+ #elif defined(__SSE2__)
181
+ for (; i + 3 < n; i += 4) {
182
+ _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
183
+ }
184
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
185
+ for (; i + 3 < n; i += 4) {
186
+ vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
187
+ }
188
+ #endif
189
+ for (; i < n; ++i) {
190
+ y[i] = ggml_silu_f32(x[i]);
191
+ }
192
+ }
193
+
194
+ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
195
+ int i = 0;
196
+ ggml_float sum = 0;
197
+ #if defined(__AVX512F__) && defined(__AVX512DQ__)
198
+ for (; i + 15 < n; i += 16) {
199
+ __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
200
+ _mm512_set1_ps(max)));
201
+ _mm512_storeu_ps(y + i, val);
202
+ sum += (ggml_float)_mm512_reduce_add_ps(val);
203
+ }
204
+ #elif defined(__AVX2__) && defined(__FMA__)
205
+ for (; i + 7 < n; i += 8) {
206
+ __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
207
+ _mm256_set1_ps(max)));
208
+ _mm256_storeu_ps(y + i, val);
209
+ __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
210
+ _mm256_castps256_ps128(val));
211
+ val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
212
+ val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
213
+ sum += (ggml_float)_mm_cvtss_f32(val2);
214
+ }
215
+ #elif defined(__SSE2__)
216
+ for (; i + 3 < n; i += 4) {
217
+ __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
218
+ _mm_set1_ps(max)));
219
+ _mm_storeu_ps(y + i, val);
220
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
221
+ val = _mm_add_ps(val, _mm_movehl_ps(val, val));
222
+ val = _mm_add_ss(val, _mm_movehdup_ps(val));
223
+ #else
224
+ __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
225
+ val = _mm_add_ps(val, tmp);
226
+ tmp = _mm_movehl_ps(tmp, val);
227
+ val = _mm_add_ss(val, tmp);
228
+ #endif
229
+ sum += (ggml_float)_mm_cvtss_f32(val);
230
+ }
231
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
232
+ for (; i + 3 < n; i += 4) {
233
+ float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
234
+ vdupq_n_f32(max)));
235
+ vst1q_f32(y + i, val);
236
+ sum += (ggml_float)vaddvq_f32(val);
237
+ }
238
+ #endif
239
+ for (; i < n; ++i) {
240
+ float val = expf(x[i] - max);
241
+ sum += (ggml_float)val;
242
+ y[i] = val;
243
+ }
244
+ return sum;
245
+ }
246
+
247
+ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
248
+ // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)
249
+
250
+ int i = 0;
251
+ ggml_float sum = 0;
252
+ for (; i < n; ++i) {
253
+ float val = x[i] - max;
254
+ y[i] = val;
255
+ sum += (ggml_float)expf(val);
256
+ }
257
+ return sum = (ggml_float)logf(sum);
258
+ }
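A minimal usage sketch for the helpers defined above, using the signatures declared in vec.h; the buffer sizes and fill values are made up for illustration:

#include "vec.h"

static void softmax_example(void) {
    enum { N = 256 };
    float a[N], b[N], p[N], dot;

    for (int i = 0; i < N; ++i) { a[i] = 0.01f*i; b[i] = 1.0f; }

    // dot product; bs/bx/by are unused and nrc must be 1 (see the asserts in ggml_vec_dot_f32)
    ggml_vec_dot_f32(N, &dot, 0, a, 0, b, 0, 1);
    (void) dot;

    // numerically stable softmax: subtract the max, exponentiate, then normalize by the returned sum
    float max = a[0];
    for (int i = 1; i < N; ++i) if (a[i] > max) max = a[i];

    const ggml_float sum = ggml_vec_soft_max_f32(N, p, a, max);
    for (int i = 0; i < N; ++i) p[i] /= (float) sum;
}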
ggml/src/ggml-cpu/vec.h ADDED
@@ -0,0 +1,802 @@
1
+ // Vectorized functions for fundamental operations
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-impl.h"
6
+ #include "simd-mappings.h"
7
+ #include "ggml.h"
8
+
9
+ #if defined(GGML_USE_ACCELERATE)
10
+ #include <Accelerate/Accelerate.h>
11
+ #endif
12
+
13
+ // floating point type used to accumulate sums
14
+ typedef double ggml_float;
15
+
16
+ #define GGML_GELU_FP16
17
+ #define GGML_GELU_QUICK_FP16
18
+
19
+ #define GGML_SOFT_MAX_UNROLL 4
20
+ #define GGML_VEC_DOT_UNROLL 2
21
+ #define GGML_VEC_MAD_UNROLL 32
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ //
28
+ // global data
29
+ //
30
+
31
+ // precomputed gelu table for f16 (128 KB)
32
+ extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
33
+
34
+ // precomputed quick gelu table for f16 (128 KB)
35
+ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
36
+
37
+ //
38
+ // fundamental operations
39
+ //
40
+
41
+ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
42
+ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
43
+ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
44
+
45
+ void ggml_vec_silu_f32(const int n, float * y, const float * x);
46
+ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
47
+ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
48
+
49
+ inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
50
+ inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
51
+
52
+ inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
53
+ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
54
+
55
+ inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
56
+ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
57
+ inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
58
+ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
59
+ for (int i = 0; i < n; ++i) {
60
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
61
+ }
62
+ }
63
+ inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
64
+ inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
65
+ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
66
+ inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
67
+ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
68
+ for (int i = 0; i < n; ++i) {
69
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
70
+ }
71
+ }
72
+ inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
73
+ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
74
+ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
75
+ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
76
+ for (int i = 0; i < n; ++i) {
77
+ y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
78
+ }
79
+ }
80
+
81
+ inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
82
+ inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
83
+ for (int i = 0; i < n; ++i) {
84
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
85
+ }
86
+ }
87
+ inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
88
+ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
89
+ for (int i = 0; i < n; ++i) {
90
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
91
+ }
92
+ }
93
+
94
+ // compute GGML_VEC_DOT_UNROLL dot products at once
95
+ // xs - x row stride in bytes
96
+ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
97
+ ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
98
+
99
+ ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
100
+
101
+ for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
102
+ x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
103
+ }
104
+
105
+ #if defined(GGML_SIMD)
106
+ const int np = (n & ~(GGML_F16_STEP - 1));
107
+
108
+ GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
109
+
110
+ GGML_F16_VEC ax[GGML_F16_ARR];
111
+ GGML_F16_VEC ay[GGML_F16_ARR];
112
+
113
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
114
+ for (int j = 0; j < GGML_F16_ARR; j++) {
115
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
116
+
117
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
118
+ ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
119
+
120
+ sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
121
+ }
122
+ }
123
+ }
124
+
125
+ // reduce each of the GGML_VEC_DOT_UNROLL vector accumulators to a scalar
126
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
127
+ GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
128
+ }
129
+
130
+ // leftovers
131
+ for (int i = np; i < n; ++i) {
132
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
133
+ sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
134
+ }
135
+ }
136
+ #else
137
+ for (int i = 0; i < n; ++i) {
138
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
139
+ sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
140
+ }
141
+ }
142
+ #endif
143
+
144
+ for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
145
+ s[i] = (float)sumf[i];
146
+ }
147
+ }
148
+
149
+ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
150
+ #if defined(GGML_SIMD)
151
+ const int np = (n & ~(GGML_F32_STEP - 1));
152
+
153
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
154
+
155
+ GGML_F32_VEC ax[GGML_F32_ARR];
156
+ GGML_F32_VEC ay[GGML_F32_ARR];
157
+
158
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
159
+ for (int j = 0; j < GGML_F32_ARR; j++) {
160
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
161
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
162
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
163
+
164
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
165
+ }
166
+ }
167
+
168
+ // leftovers
169
+ for (int i = np; i < n; ++i) {
170
+ y[i] += x[i]*v;
171
+ }
172
+ #else
173
+ // scalar
174
+ for (int i = 0; i < n; ++i) {
175
+ y[i] += x[i]*v;
176
+ }
177
+ #endif
178
+ }
179
+
180
+ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
181
+ #if defined(GGML_SIMD)
182
+ const int np = (n & ~(GGML_F16_STEP - 1));
183
+
184
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
185
+
186
+ GGML_F16_VEC ax[GGML_F16_ARR];
187
+ GGML_F16_VEC ay[GGML_F16_ARR];
188
+
189
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
190
+ for (int j = 0; j < GGML_F16_ARR; j++) {
191
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
192
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
193
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
194
+
195
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
196
+ }
197
+ }
198
+
199
+ // leftovers
200
+ for (int i = np; i < n; ++i) {
201
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
202
+ }
203
+ #else
204
+ // scalar
205
+ for (int i = 0; i < n; ++i) {
206
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
207
+ }
208
+ #endif
209
+ }
210
+
211
+ // xs and vs are byte strides of x and v
212
+ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
213
+
214
+ const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
215
+ const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
216
+
217
+ for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
218
+ x[i] = (const float *) ((const char *) xv + i*xs);
219
+ v[i] = (const float *) ((const char *) vv + i*vs);
220
+ }
221
+
222
+ #if defined(GGML_SIMD)
223
+ const int np = (n & ~(GGML_F32_STEP - 1));
224
+
225
+ GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
226
+
227
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
228
+ vx[k] = GGML_F32_VEC_SET1(v[k][0]);
229
+ }
230
+
231
+ GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
232
+ GGML_F32_VEC ay[GGML_F32_ARR];
233
+
234
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
235
+ for (int j = 0; j < GGML_F32_ARR; j++) {
236
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
237
+
238
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
239
+ ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
240
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
241
+ }
242
+
243
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
244
+ }
245
+ }
246
+
247
+ // leftovers
248
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
249
+ for (int i = np; i < n; ++i) {
250
+ y[i] += x[k][i]*v[k][0];
251
+ }
252
+ }
253
+ #else
254
+ // scalar
255
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
256
+ for (int i = 0; i < n; ++i) {
257
+ y[i] += x[k][i]*v[k][0];
258
+ }
259
+ }
260
+ #endif
261
+ }
262
+
263
+ //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
264
+ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
265
+ #if defined(GGML_USE_ACCELERATE)
266
+ vDSP_vsmul(y, 1, &v, y, 1, n);
267
+ #elif defined(GGML_SIMD)
268
+ const int np = (n & ~(GGML_F32_STEP - 1));
269
+
270
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
271
+
272
+ GGML_F32_VEC ay[GGML_F32_ARR];
273
+
274
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
275
+ for (int j = 0; j < GGML_F32_ARR; j++) {
276
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
277
+ ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
278
+
279
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
280
+ }
281
+ }
282
+
283
+ // leftovers
284
+ for (int i = np; i < n; ++i) {
285
+ y[i] *= v;
286
+ }
287
+ #else
288
+ // scalar
289
+ for (int i = 0; i < n; ++i) {
290
+ y[i] *= v;
291
+ }
292
+ #endif
293
+ }
294
+
295
+ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
296
+ #if defined(GGML_SIMD)
297
+ const int np = (n & ~(GGML_F16_STEP - 1));
298
+
299
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
300
+
301
+ GGML_F16_VEC ay[GGML_F16_ARR];
302
+
303
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
304
+ for (int j = 0; j < GGML_F16_ARR; j++) {
305
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
306
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
307
+
308
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
309
+ }
310
+ }
311
+
312
+ // leftovers
313
+ for (int i = np; i < n; ++i) {
314
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
315
+ }
316
+ #else
317
+ // scalar
318
+ for (int i = 0; i < n; ++i) {
319
+ y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
320
+ }
321
+ #endif
322
+ }
323
+
324
+ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
325
+ inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
326
+ inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
327
+ for (int i = 0; i < n; ++i) {
328
+ float v = GGML_FP16_TO_FP32(x[i]);
329
+ y[i] = GGML_FP32_TO_FP16(v*v);
330
+ }
331
+ }
332
+ inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
333
+ inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
334
+ for (int i = 0; i < n; ++i) {
335
+ y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
336
+ }
337
+ }
338
+ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
339
+ inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
340
+ for (int i = 0; i < n; ++i) {
341
+ y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
342
+ }
343
+ }
344
+ inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
345
+ inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
346
+ for (int i = 0; i < n; ++i) {
347
+ y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
348
+ }
349
+ }
350
+ inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
351
+ inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
352
+ for (int i = 0; i < n; ++i) {
353
+ y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
354
+ }
355
+ }
356
+ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
357
+ inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
358
+ for (int i = 0; i < n; ++i) {
359
+ y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
360
+ }
361
+ }
362
+ inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
363
+ inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
364
+ for (int i = 0; i < n; ++i) {
365
+ float v = GGML_FP16_TO_FP32(x[i]);
366
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
367
+ }
368
+ }
369
+ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
370
+ inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
371
+ for (int i = 0; i < n; ++i) {
372
+ y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
373
+ }
374
+ }
375
+ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
376
+ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
377
+ for (int i = 0; i < n; ++i) {
378
+ y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
379
+ }
380
+ }
381
+ inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
382
+ inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
383
+ for (int i = 0; i < n; ++i) {
384
+ float v = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
385
+ }
386
+ }
387
+ inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
388
+ inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
389
+ for (int i = 0; i < n; ++i) {
390
+ float v = GGML_FP16_TO_FP32(x[i]);
391
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
392
+ }
393
+ }
394
+ inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
395
+ inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
396
+ for (int i = 0; i < n; ++i) {
397
+ float v = GGML_FP16_TO_FP32(x[i]);
398
+ y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
399
+ }
400
+ }
401
+ inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
402
+ inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
403
+ for (int i = 0; i < n; ++i) {
404
+ y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
405
+ }
406
+ }
407
+ // TODO: optimize performance
408
+ inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
409
+ inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
410
+ for (int i = 0; i < n; ++i) {
411
+ float v = GGML_FP16_TO_FP32(x[i]);
412
+ y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
413
+ }
414
+ }
415
+ inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
416
+ inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
417
+ for (int i = 0; i < n; ++i) {
418
+ y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
419
+ }
420
+ }
421
+ inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
422
+ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
423
+ for (int i = 0; i < n; ++i) {
424
+ y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
425
+ }
426
+ }
427
+
428
+ static const float GELU_COEF_A = 0.044715f;
429
+ static const float GELU_QUICK_COEF = -1.702f;
430
+ static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
431
+
432
+ inline static float ggml_gelu_f32(float x) {
433
+ return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
434
+ }
435
+
436
+ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
437
+ const uint16_t * i16 = (const uint16_t *) x;
438
+ for (int i = 0; i < n; ++i) {
439
+ y[i] = ggml_table_gelu_f16[i16[i]];
440
+ }
441
+ }
442
+
443
+ #ifdef GGML_GELU_FP16
444
+ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
445
+ uint16_t t;
446
+ for (int i = 0; i < n; ++i) {
447
+ if (x[i] <= -10.0f) {
448
+ y[i] = 0.0f;
449
+ } else if (x[i] >= 10.0f) {
450
+ y[i] = x[i];
451
+ } else {
452
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
453
+ memcpy(&t, &fp16, sizeof(uint16_t));
454
+ y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
455
+ }
456
+ }
457
+ }
458
+ #else
459
+ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
460
+ for (int i = 0; i < n; ++i) {
461
+ y[i] = ggml_gelu_f32(x[i]);
462
+ }
463
+ }
464
+ #endif
465
+
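
Note: with GGML_GELU_FP16 defined (the default above), ggml_vec_gelu_f32 avoids evaluating tanhf per element by indexing the precomputed ggml_table_gelu_f16 with the bit pattern of the fp16-converted input, clamping to 0 or x outside [-10, 10]. The program below is the editor's sketch, not part of the commit; it only compares the tanh approximation from ggml_gelu_f32 with the exact erf-based GELU, using plain libm.

// Editor's sketch: tanh-approximation GELU vs. exact erf-based GELU.
#include <math.h>
#include <stdio.h>

static float gelu_tanh(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

static float gelu_exact(float x) {
    return 0.5f*x*(1.0f + erff(x/sqrtf(2.0f)));
}

int main(void) {
    const float xs[] = { -3.0f, -1.0f, 0.0f, 0.5f, 2.0f };
    for (int i = 0; i < 5; ++i) {
        printf("x=% .2f  tanh-approx=% .6f  exact=% .6f\n",
               xs[i], gelu_tanh(xs[i]), gelu_exact(xs[i]));
    }
    return 0;
}
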
466
+ inline static float ggml_gelu_quick_f32(float x) {
467
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
468
+ }
469
+
470
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
471
+ // const uint16_t * i16 = (const uint16_t *) x;
472
+ // for (int i = 0; i < n; ++i) {
473
+ // y[i] = ggml_table_gelu_quick_f16[i16[i]];
474
+ // }
475
+ //}
476
+
477
+ #ifdef GGML_GELU_QUICK_FP16
478
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
479
+ uint16_t t;
480
+ for (int i = 0; i < n; ++i) {
481
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
482
+ memcpy(&t, &fp16, sizeof(uint16_t));
483
+ y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
484
+ }
485
+ }
486
+ #else
487
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
488
+ for (int i = 0; i < n; ++i) {
489
+ y[i] = ggml_gelu_quick_f32(x[i]);
490
+ }
491
+ }
492
+ #endif
493
+
494
+ inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
495
+ for (int i = 0; i < n; ++i) {
496
+ float v = GGML_FP16_TO_FP32(x[i]);
497
+ y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
498
+ }
499
+ }
500
+
501
+ // Sigmoid Linear Unit (SiLU) function
502
+ inline static float ggml_silu_f32(float x) {
503
+ return x/(1.0f + expf(-x));
504
+ }
505
+ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
506
+ float v = GGML_FP16_TO_FP32(x);
507
+ return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
508
+ }
509
+
510
+ #if __FINITE_MATH_ONLY__
511
+ #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
512
+ #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
513
+ #endif
514
+
515
+ #if defined(__ARM_NEON) && defined(__aarch64__)
516
+
517
+ // adapted from arm limited optimized routine
518
+ // the maximum error is 1.45358 plus 0.5 ulps
519
+ // numbers above 88.38 will flush to infinity
520
+ // numbers beneath -103.97 will flush to zero
521
+ inline static float32x4_t ggml_v_expf(float32x4_t x) {
522
+ const float32x4_t r = vdupq_n_f32(0x1.8p23f);
523
+ const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
524
+ const float32x4_t n = vsubq_f32(z, r);
525
+ const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
526
+ vdupq_n_f32(0x1.7f7d1cp-20f));
527
+ const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
528
+ const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
529
+ const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
530
+ const float32x4_t u = vmulq_f32(b, b);
531
+ const float32x4_t j = vfmaq_f32(
532
+ vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
533
+ vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
534
+ vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
535
+ if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
536
+ return vfmaq_f32(k, j, k);
537
+ const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
538
+ const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
539
+ const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
540
+ return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
541
+ vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
542
+ }
543
+
544
+ // computes silu x/(1+exp(-x)) in single precision vector
545
+ inline static float32x4_t ggml_v_silu(float32x4_t x) {
546
+ const float32x4_t one = vdupq_n_f32(1.0f);
547
+ const float32x4_t zero = vdupq_n_f32(0.0f);
548
+ const float32x4_t neg_x = vsubq_f32(zero, x);
549
+ const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
550
+ const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
551
+ return vdivq_f32(x, one_plus_exp_neg_x);
552
+ }
553
+
554
+ #elif defined(__AVX512F__) && defined(__AVX512DQ__)
555
+
556
+ // adapted from arm limited optimized routine
557
+ // the maximum error is 1.45358 plus 0.5 ulps
558
+ // numbers above 88.38 will flush to infinity
559
+ // numbers beneath -103.97 will flush to zero
560
+ inline static __m512 ggml_v_expf(__m512 x) {
561
+ const __m512 r = _mm512_set1_ps(0x1.8p23f);
562
+ const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
563
+ const __m512 n = _mm512_sub_ps(z, r);
564
+ const __m512 b =
565
+ _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
566
+ _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
567
+ const __mmask16 d =
568
+ _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
569
+ const __m512 u = _mm512_mul_ps(b, b);
570
+ const __m512 j = _mm512_fmadd_ps(
571
+ _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
572
+ _mm512_set1_ps(0x1.573e2ep-5f)),
573
+ u,
574
+ _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
575
+ _mm512_set1_ps(0x1.fffdb6p-2f))),
576
+ u,
577
+ _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
578
+ const __m512 res = _mm512_scalef_ps(j, n);
579
+ if (_mm512_kortestz(d, d))
580
+ return res;
581
+ const __m512 zero = _mm512_setzero_ps();
582
+ const __m512 alt = _mm512_mask_blend_ps(
583
+ _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
584
+ return _mm512_mask_blend_ps(d, res, alt);
585
+ }
586
+
587
+ // computes silu x/(1+exp(-x)) in single precision vector
588
+ inline static __m512 ggml_v_silu(__m512 x) {
589
+ const __m512 one = _mm512_set1_ps(1);
590
+ const __m512 zero = _mm512_setzero_ps();
591
+ const __m512 neg_x = _mm512_sub_ps(zero, x);
592
+ const __m512 exp_neg_x = ggml_v_expf(neg_x);
593
+ const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
594
+ return _mm512_div_ps(x, one_plus_exp_neg_x);
595
+ }
596
+
597
+ #elif defined(__AVX2__) && defined(__FMA__)
598
+
599
+ // adapted from arm limited optimized routine
600
+ // the maximum error is 1.45358 plus 0.5 ulps
601
+ // numbers above 88.38 will flush to infinity
602
+ // numbers beneath -103.97 will flush to zero
603
+ inline static __m256 ggml_v_expf(__m256 x) {
604
+ const __m256 r = _mm256_set1_ps(0x1.8p23f);
605
+ const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
606
+ const __m256 n = _mm256_sub_ps(z, r);
607
+ const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
608
+ _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
609
+ const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
610
+ const __m256 k = _mm256_castsi256_ps(
611
+ _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
612
+ const __m256i c = _mm256_castps_si256(
613
+ _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
614
+ _mm256_set1_ps(126), _CMP_GT_OQ));
615
+ const __m256 u = _mm256_mul_ps(b, b);
616
+ const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
617
+ _mm256_set1_ps(0x1.573e2ep-5f)), u,
618
+ _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
619
+ _mm256_set1_ps(0x1.fffdb6p-2f))),
620
+ u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
621
+ if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
622
+ return _mm256_fmadd_ps(j, k, k);
623
+ const __m256i g = _mm256_and_si256(
624
+ _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
625
+ _mm256_set1_epi32(0x82000000u));
626
+ const __m256 s1 =
627
+ _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
628
+ const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
629
+ const __m256i d = _mm256_castps_si256(
630
+ _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
631
+ _mm256_set1_ps(192), _CMP_GT_OQ));
632
+ return _mm256_or_ps(
633
+ _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
634
+ _mm256_andnot_ps(
635
+ _mm256_castsi256_ps(d),
636
+ _mm256_or_ps(
637
+ _mm256_and_ps(_mm256_castsi256_ps(c),
638
+ _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
639
+ _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
640
+ }
641
+
642
+ // computes silu x/(1+exp(-x)) in single precision vector
643
+ inline static __m256 ggml_v_silu(__m256 x) {
644
+ const __m256 one = _mm256_set1_ps(1);
645
+ const __m256 zero = _mm256_setzero_ps();
646
+ const __m256 neg_x = _mm256_sub_ps(zero, x);
647
+ const __m256 exp_neg_x = ggml_v_expf(neg_x);
648
+ const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
649
+ return _mm256_div_ps(x, one_plus_exp_neg_x);
650
+ }
651
+
652
+ #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
653
+
654
+ #if defined(__FMA__)
655
+ #define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
656
+ #define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
657
+ #else
658
+ #define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
659
+ #define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
660
+ #endif
661
+
662
+ // adapted from arm limited optimized routine
663
+ // the maximum error is 1.45358 plus 0.5 ulps
664
+ // numbers above 88.38 will flush to infinity
665
+ // numbers beneath -103.97 will flush to zero
666
+ inline static __m128 ggml_v_expf(__m128 x) {
667
+ const __m128 r = _mm_set1_ps(0x1.8p23f);
668
+ const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
669
+ const __m128 n = _mm_sub_ps(z, r);
670
+ const __m128 b =
671
+ NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
672
+ const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
673
+ const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
674
+ const __m128i c =
675
+ _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
676
+ const __m128 u = _mm_mul_ps(b, b);
677
+ const __m128 j =
678
+ MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
679
+ MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
680
+ u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
681
+ if (!_mm_movemask_epi8(c))
682
+ return MADD128(j, k, k);
683
+ const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
684
+ _mm_set1_epi32(0x82000000u));
685
+ const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
686
+ const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
687
+ const __m128i d =
688
+ _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
689
+ return _mm_or_ps(
690
+ _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
691
+ _mm_andnot_ps(_mm_castsi128_ps(d),
692
+ _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
693
+ _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
694
+ }
695
+
696
+ // computes silu x/(1+exp(-x)) in single precision vector
697
+ inline static __m128 ggml_v_silu(__m128 x) {
698
+ const __m128 one = _mm_set1_ps(1);
699
+ const __m128 zero = _mm_setzero_ps();
700
+ const __m128 neg_x = _mm_sub_ps(zero, x);
701
+ const __m128 exp_neg_x = ggml_v_expf(neg_x);
702
+ const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
703
+ return _mm_div_ps(x, one_plus_exp_neg_x);
704
+ }
705
+
706
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__
707
+
708
+ inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
709
+ for (int i = 0; i < n; ++i) {
710
+ y[i] = ggml_silu_f16(x[i]);
711
+ }
712
+ }
713
+
714
+ inline static float ggml_silu_backward_f32(float x, float dy) {
715
+ const float s = 1.0f/(1.0f + expf(-x));
716
+ return dy*s*(1.0f + x*(1.0f - s));
717
+ }
718
+
719
+ inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
720
+ const float v = GGML_FP16_TO_FP32(x);
721
+ const float s = 1.0f/(1.0f + expf(-v));
722
+ return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
723
+ }
724
+
725
+ inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
726
+ for (int i = 0; i < n; ++i) {
727
+ dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
728
+ }
729
+ }
730
+
731
+ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
732
+ for (int i = 0; i < n; ++i) {
733
+ dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
734
+ }
735
+ }
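
Note: ggml_silu_backward_f32 returns dy scaled by the SiLU derivative s*(1 + x*(1 - s)) with s = sigmoid(x). A quick way to convince yourself of that formula is a central-difference check; the program below is an editor's sketch with illustrative names, not part of the commit.

// Editor's sketch: check the closed-form SiLU derivative numerically.
#include <math.h>
#include <stdio.h>

static float silu(float x) { return x/(1.0f + expf(-x)); }

static float silu_backward(float x, float dy) {
    const float s = 1.0f/(1.0f + expf(-x));
    return dy*s*(1.0f + x*(1.0f - s));
}

int main(void) {
    const float x = 0.7f, h = 1e-3f;
    const float analytic = silu_backward(x, 1.0f);
    const float numeric  = (silu(x + h) - silu(x - h)) / (2.0f*h);
    printf("analytic=%f numeric=%f\n", analytic, numeric);
    return 0;
}
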
736
+
737
+ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
738
+ #ifndef GGML_USE_ACCELERATE
739
+ ggml_float sum = 0.0;
740
+ for (int i = 0; i < n; ++i) {
741
+ sum += (ggml_float)x[i];
742
+ }
743
+ *s = (float)sum;
744
+ #else
745
+ vDSP_sve(x, 1, s, n);
746
+ #endif
747
+ }
748
+
749
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
750
+ ggml_float sum = 0.0;
751
+ for (int i = 0; i < n; ++i) {
752
+ sum += (ggml_float)x[i];
753
+ }
754
+ *s = sum;
755
+ }
756
+
757
+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
758
+ float sum = 0.0f;
759
+ for (int i = 0; i < n; ++i) {
760
+ sum += GGML_FP16_TO_FP32(x[i]);
761
+ }
762
+ *s = sum;
763
+ }
764
+
765
+ inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
766
+ float sum = 0.0f;
767
+ for (int i = 0; i < n; ++i) {
768
+ sum += GGML_BF16_TO_FP32(x[i]);
769
+ }
770
+ *s = sum;
771
+ }
772
+
773
+ inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
774
+ #ifndef GGML_USE_ACCELERATE
775
+ float max = -INFINITY;
776
+ for (int i = 0; i < n; ++i) {
777
+ max = MAX(max, x[i]);
778
+ }
779
+ *s = max;
780
+ #else
781
+ vDSP_maxv(x, 1, s, n);
782
+ #endif
783
+ }
784
+
785
+ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
786
+ ggml_vec_norm_f32(n, s, x);
787
+ *s = 1.f/(*s);
788
+ }
789
+
790
+ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
791
+ float max = -INFINITY;
792
+ int idx = 0;
793
+ for (int i = 0; i < n; ++i) {
794
+ max = MAX(max, x[i]);
795
+ if (max == x[i]) { idx = i; }
796
+ }
797
+ *s = idx;
798
+ }
799
+
800
+ #ifdef __cplusplus
801
+ }
802
+ #endif
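
Note on the structure of the vectorized helpers above: each SIMD body first processes np = n & ~(GGML_F32_STEP - 1) elements in full vector steps and then finishes the remaining n - np elements ("leftovers") with scalar code. The sketch below is the editor's, not part of the commit; it reproduces that pattern with a scalar stand-in for the vector step so it compiles without the SIMD mappings.

// Editor's sketch: the "main loop + leftovers" pattern used in vec.h,
// e.g. by ggml_vec_mad_f32, with STEP standing in for GGML_F32_STEP.
#include <stdio.h>

#define STEP 4

static void vec_mad(int n, float *y, const float *x, float v) {
    const int np = n & ~(STEP - 1);   // largest multiple of STEP not above n
    for (int i = 0; i < np; i += STEP) {
        // in vec.h this body is a block of SIMD loads, FMAs and stores
        for (int j = 0; j < STEP; ++j) {
            y[i + j] += x[i + j]*v;
        }
    }
    for (int i = np; i < n; ++i) {    // leftovers handled with scalar code
        y[i] += x[i]*v;
    }
}

int main(void) {
    float x[7] = {1, 2, 3, 4, 5, 6, 7};
    float y[7] = {0};
    vec_mad(7, y, x, 0.5f);
    for (int i = 0; i < 7; ++i) printf("%g ", y[i]);
    printf("\n");
    return 0;
}
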