Commit 0754d43 · committed by cmdr2, Diego Devesa
1 Parent(s): 6688a2b
cpu: move all the operators into a separate c++ file (except mul_mat) (ggml/1167)
* cpu: refactor SIMD mappings and vectorized op functions into separate files
* Fix warning for ggml_float to float
* Fix warnings
* cpu: move all the operations (except mul_mat) to a separate c++ file
* fix whitespace
* Update ggml/src/ggml-cpu/vec.h
Co-authored-by: Diego Devesa <[email protected]>
* Fix PR comments - use GGML_UNUSED, use cassert in ops.cpp
* Reverse the order of import for ops.h and vec.h, to match what was present in ggml-cpu.c previously
---------
Co-authored-by: Diego Devesa <[email protected]>
- ggml/src/ggml-cpu/CMakeLists.txt +5 -0
- ggml/src/ggml-cpu/ggml-cpu.c +0 -0
- ggml/src/ggml-cpu/ops.cpp +0 -0
- ggml/src/ggml-cpu/ops.h +128 -0
- ggml/src/ggml-cpu/simd-mappings.h +884 -0
- ggml/src/ggml-cpu/vec.cpp +258 -0
- ggml/src/ggml-cpu/vec.h +802 -0
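
To make the new layout concrete, here is a minimal sketch of the shape an operator takes once it lives in ops.cpp instead of ggml-cpu.c. The function below is hypothetical (the real implementations are in the ops.cpp diff, which is too large to render on this page); it only illustrates the conventions called out in the review comments: declarations come from ops.h, asserts come from <cassert>, and unused parameters are silenced with GGML_UNUSED.

// hypothetical skeleton, not the actual ops.cpp contents
#include "ops.h"   // forward-op declarations such as ggml_compute_forward_add
#include <cassert> // per the review comment, ops.cpp uses <cassert> rather than <assert.h>
// (the real file also pulls in the internal CPU headers that define ggml_compute_params)

void ggml_compute_forward_example(const struct ggml_compute_params * params, struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0]; // inputs hang off the destination tensor
    assert(src0 != NULL);

    const int ith = params->ith; // index of this thread
    const int nth = params->nth; // total number of threads
    GGML_UNUSED(ith);
    GGML_UNUSED(nth);

    // ... per-row computation, split across ith/nth, would go here ...
}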
ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -28,6 +28,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         ggml-cpu/binary-ops.cpp
         ggml-cpu/unary-ops.h
         ggml-cpu/unary-ops.cpp
+        ggml-cpu/simd-mappings.h
+        ggml-cpu/vec.h
+        ggml-cpu/vec.cpp
+        ggml-cpu/ops.h
+        ggml-cpu/ops.cpp
         )

     target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
ggml/src/ggml-cpu/ggml-cpu.c
CHANGED

The diff for this file is too large to render. See raw diff.
ggml/src/ggml-cpu/ops.cpp
ADDED

The diff for this file is too large to render. See raw diff.
ggml/src/ggml-cpu/ops.h
ADDED

@@ -0,0 +1,128 @@
#pragma once

#include "ggml.h"

//
// cache line
//

#if defined(__cpp_lib_hardware_interference_size)
#define CACHE_LINE_SIZE std::hardware_destructive_interference_size
#else
#if defined(__POWER9_VECTOR__)
#define CACHE_LINE_SIZE 128
#elif defined(__VXE__) || defined(__VXE2__)
#define CACHE_LINE_SIZE 256
#else
#define CACHE_LINE_SIZE 64
#endif
#endif

static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

#ifdef __cplusplus
extern "C" {
#endif

void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_repeat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_out_prod(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_ext(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * q,
        const struct ggml_tensor * k,
        const struct ggml_tensor * v,
        const struct ggml_tensor * mask,
        struct ggml_tensor * dst);
void ggml_compute_forward_flash_attn_back(
        const struct ggml_compute_params * params,
        const bool masked,
        struct ggml_tensor * dst);
void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_unary(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst,
        const ggml_unary_op_f32_t fun);
void ggml_compute_forward_map_binary(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst,
        const ggml_binary_op_f32_t fun);
void ggml_compute_forward_map_custom1_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst,
        const ggml_custom1_op_f32_t fun);
void ggml_compute_forward_map_custom2_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst,
        const ggml_custom2_op_f32_t fun);
void ggml_compute_forward_map_custom3_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst,
        const ggml_custom3_op_f32_t fun);
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);

#ifdef __cplusplus
}
#endif
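
ops.h is only the table of entry points; the dispatch itself stays behind in ggml-cpu.c. As a hedged sketch (assuming the usual ggml pattern of switching on the destination tensor's op field; the actual function in ggml-cpu.c may be named and structured differently), the call sites look roughly like this:

// illustrative dispatch sketch, two cases shown out of the full GGML_OP_* switch
static void compute_forward_sketch(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
    switch (tensor->op) {
        case GGML_OP_DUP:
            ggml_compute_forward_dup(params, tensor);
            break;
        case GGML_OP_ADD:
            ggml_compute_forward_add(params, tensor);
            break;
        // ... the remaining cases map one-to-one onto the declarations above ...
        default:
            break;
    }
}

Keeping every implementation behind this plain C interface is what lets the bodies move into a C++ translation unit without touching the callers.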
ggml/src/ggml-cpu/simd-mappings.h
ADDED

@@ -0,0 +1,884 @@
#pragma once

#include "ggml-cpu-impl.h"

//
// simd mappings
//

// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires to define the corresponding SIMD macros
//
// GGML_F32_STEP / GGML_F16_STEP
//   number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
//   number of elements to fit in a single register
//

#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 NEON

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
#define GGML_F32x4_LOAD         vld1q_f32
#define GGML_F32x4_STORE        vst1q_f32
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD          vaddq_f32
#define GGML_F32x4_MUL          vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

#define GGML_F16x8              float16x8_t
#define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x)      vld1q_f16((const ggml_fp16_internal_t *)(x))
#define GGML_F16x8_STORE        vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD          vaddq_f16
#define GGML_F16x8_MUL          vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \
do { \
    int offset = GGML_F16_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
    } \
    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)

#define GGML_F16_VEC                GGML_F16x8
#define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
#define GGML_F16_VEC_FMA            GGML_F16x8_FMA
#define GGML_F16_VEC_ADD            GGML_F16x8_ADD
#define GGML_F16_VEC_MUL            GGML_F16x8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
// if FP16 vector arithmetic is not supported, we use FP32 instead
// and take advantage of the vcvt_ functions to convert to/from FP16

#define GGML_F16_STEP 16
#define GGML_F16_EPR  4

#define GGML_F32Cx4              float32x4_t
#define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
#define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD          vaddq_f32
#define GGML_F32Cx4_MUL          vmulq_f32
#define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE

#define GGML_F16_VEC                GGML_F32Cx4
#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif

#elif defined(__AVX512F__)

#define GGML_SIMD

// F32 AVX512

#define GGML_F32_STEP 64
#define GGML_F32_EPR  16

#define GGML_F32x16         __m512
#define GGML_F32x16_ZERO    _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD    _mm512_loadu_ps
#define GGML_F32x16_STORE   _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD     _mm512_add_ps
#define GGML_F32x16_MUL     _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)

// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x16
#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE

// F16 AVX512

// F16 AVX

#define GGML_F16_STEP 64
#define GGML_F16_EPR  16

// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead

#define GGML_F32Cx16         __m512
#define GGML_F32Cx16_ZERO    _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)

// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so F16C guard isn't required
#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))

#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD          _mm512_add_ps
#define GGML_F32Cx16_MUL          _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)

#define GGML_F16_VEC                GGML_F32Cx16
#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL

#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
#elif defined(__AVX__)

#define GGML_SIMD

// F32 AVX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    _mm256_setzero_ps()
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_F32x8_LOAD    _mm256_loadu_ps
#define GGML_F32x8_STORE   _mm256_storeu_ps
#if defined(__FMA__)
    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_F32x8_ADD _mm256_add_ps
#define GGML_F32x8_MUL _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0); \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 AVX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_F32Cx8         __m256
#define GGML_F32Cx8_ZERO    _mm256_setzero_ps()
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)

#if defined(__F16C__)
// the  _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    for (int i = 0; i < 8; i++) {
        tmp[i] = GGML_FP16_TO_FP32(x[i]);
    }

    return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++)
        x[i] = GGML_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif

#define GGML_F32Cx8_FMA    GGML_F32x8_FMA
#define GGML_F32Cx8_ADD    _mm256_add_ps
#define GGML_F32Cx8_MUL    _mm256_mul_ps
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__POWER9_VECTOR__)

#define GGML_SIMD

// F32 POWER9

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              vector float
#define GGML_F32x4_ZERO         0.0f
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    res = vec_extract(x[0], 0) + \
          vec_extract(x[0], 1) + \
          vec_extract(x[0], 2) + \
          vec_extract(x[0], 3); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 POWER9
#define GGML_F16_STEP       GGML_F32_STEP
#define GGML_F16_EPR        GGML_F32_EPR
#define GGML_F16_VEC        GGML_F32x4
#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
    vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
    vec_extract_fp32_from_shortl(vec_xl(0, p))
#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
#define GGML_F16_VEC_STORE(p, r, i) \
    if (i & 0x1) \
        vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
                                       r[i - GGML_ENDIAN_BYTE(0)]), \
                0, p - GGML_F16_EPR)

#elif defined(__wasm_simd128__)

#define GGML_SIMD

// F32 WASM

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              v128_t
#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
#define GGML_F32x4_LOAD         wasm_v128_load
#define GGML_F32x4_STORE        wasm_v128_store
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_F32x4_ADD          wasm_f32x4_add
#define GGML_F32x4_MUL          wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
          wasm_f32x4_extract_lane(x[0], 2) + \
          wasm_f32x4_extract_lane(x[0], 3); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 WASM

#define GGML_F16_STEP 16
#define GGML_F16_EPR  4

inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
    float tmp[4];

    tmp[0] = GGML_FP16_TO_FP32(p[0]);
    tmp[1] = GGML_FP16_TO_FP32(p[1]);
    tmp[2] = GGML_FP16_TO_FP32(p[2]);
    tmp[3] = GGML_FP16_TO_FP32(p[3]);

    return wasm_v128_load(tmp);
}

inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
    float tmp[4];

    wasm_v128_store(tmp, x);

    p[0] = GGML_FP32_TO_FP16(tmp[0]);
    p[1] = GGML_FP32_TO_FP16(tmp[1]);
    p[2] = GGML_FP32_TO_FP16(tmp[2]);
    p[3] = GGML_FP32_TO_FP16(tmp[3]);
}

#define GGML_F16x4             v128_t
#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA         GGML_F32x4_FMA
#define GGML_F16x4_ADD         wasm_f32x4_add
#define GGML_F16x4_MUL         wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x) \
{ \
    int offset = GGML_F16_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
          wasm_f32x4_extract_lane(x[0], 2) + \
          wasm_f32x4_extract_lane(x[0], 3)); \
}

#define GGML_F16_VEC                GGML_F16x4
#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE

#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD    _mm_loadu_ps
#define GGML_F32x4_STORE   _mm_storeu_ps
#if defined(__FMA__)
    // TODO: Does this work?
    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_FP16_TO_FP32(x[3]);

    return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    _mm_storeu_ps(arr, y);

    x[0] = GGML_FP32_TO_FP16(arr[0]);
    x[1] = GGML_FP32_TO_FP16(arr[1]);
    x[2] = GGML_FP32_TO_FP16(arr[2]);
    x[3] = GGML_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         _mm_add_ps
#define GGML_F32Cx4_MUL         _mm_mul_ps
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                GGML_F32Cx4
#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE

#elif defined(__loongarch_asx)

#define GGML_SIMD

// F32 LASX
#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8              __m256
#define GGML_F32x8_ZERO         (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x)      (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x)      (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD          __lasx_xvfadd_s
#define GGML_F32x8_MUL          __lasx_xvfmul_s
#define GGML_F32x8_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    float *tmp_p = (float *)&x[0]; \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 LASX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8         __m256
#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    __m256i a;
    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
    return __lasx_xvfcvtl_s_h(a);
}

static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m256i a = __lasx_xvfcvt_h_s(y, y);
    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

#define GGML_F32Cx8_FMA    GGML_F32x8_FMA
#define GGML_F32Cx8_ADD    __lasx_xvfadd_s
#define GGML_F32Cx8_MUL    __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__loongarch_sx)

#define GGML_SIMD

// F32 LSX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4                __m128
#define GGML_F32x4_ZERO           __lsx_vldi(0)
#define GGML_F32x4_SET1(x)        __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x)        __lsx_vld((x), 0)
#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
#define GGML_F32x4_FMA(a, b, c)   __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD            __lsx_vfadd_s
#define GGML_F32x4_MUL            __lsx_vfmul_s
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
    } \
    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 LSX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_FP16_TO_FP32(x[3]);

    return __lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    __lsx_vst(y, arr, 0);

    x[0] = GGML_FP32_TO_FP16(arr[0]);
    x[1] = GGML_FP32_TO_FP16(arr[1]);
    x[2] = GGML_FP32_TO_FP16(arr[2]);
    x[3] = GGML_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         __lsx_vfadd_s
#define GGML_F32Cx4_MUL         __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                GGML_F32Cx4
#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE

#elif defined(__VXE__) || defined(__VXE2__)

#define GGML_SIMD

// F32 s390x

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              __vector float
#define GGML_F32x4_ZERO         vec_splats(0.0f)
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    res = vec_extract(x[0], 0) + \
          vec_extract(x[0], 1) + \
          vec_extract(x[0], 2) + \
          vec_extract(x[0], 3); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 s390x
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR  GGML_F32_EPR

static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
    float tmp[4];

    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_FP16_TO_FP32(x[i]);
    }

    return vec_xl(0, tmp);
}

static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
    float arr[4];

    vec_xst(y, 0, arr);

    for (int i = 0; i < 4; i++) {
        x[i] = GGML_FP32_TO_FP16(arr[i]);
    }
}

#define GGML_F16_VEC                GGML_F32x4
#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
//   number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
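
The header's comment spells out the porting contract: a new architecture only needs to supply its own register type, load/store/FMA/add/mul wrappers, and a reduction, then alias them to the GGML_F32_VEC_* / GGML_F16_VEC_* names, and the kernels in vec.cpp compile unchanged on top of those macros. As a hedged illustration (not part of the file, and using a GGML_EX_ prefix to make clear these names do not exist in ggml), a degenerate one-float-per-register mapping would look like:

// illustration only: a scalar mapping that satisfies the same contract
// GGML_EX_F32_STEP / GGML_EX_F32_EPR give an "array" of 4 partial accumulators
#define GGML_EX_F32_STEP 4
#define GGML_EX_F32_EPR  1

#define GGML_EX_F32x1              float
#define GGML_EX_F32x1_ZERO         0.0f
#define GGML_EX_F32x1_SET1(x)      (x)
#define GGML_EX_F32x1_LOAD(p)      (*(p))
#define GGML_EX_F32x1_STORE(p, r)  (*(p) = (r))
#define GGML_EX_F32x1_FMA(a, b, c) ((a) + (b)*(c))
#define GGML_EX_F32x1_ADD(a, b)    ((a) + (b))
#define GGML_EX_F32x1_MUL(a, b)    ((a) * (b))
#define GGML_EX_F32x1_REDUCE(res, x) \
    { (res) = (x)[0] + (x)[1] + (x)[2] + (x)[3]; }

The real mappings above differ only in register width and in routing each of these operations to the architecture's intrinsics.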
ggml/src/ggml-cpu/vec.cpp
ADDED

@@ -0,0 +1,258 @@
#include "vec.h"

#include <cassert>

#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
#endif

// precomputed gelu table for f16 (128 KB)
ggml_fp16_t ggml_table_gelu_f16[1 << 16];

// precomputed quick gelu table for f16 (128 KB)
ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];

void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);

#if defined(GGML_SIMD)
    float sumf = 0.0f;
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }

    // reduce sum0..sum3 to sum0
    GGML_F32_VEC_REDUCE(sumf, sum);

    // leftovers
    for (int i = np; i < n; ++i) {
        sumf += x[i]*y[i];
    }
#else
    // scalar
    ggml_float sumf = 0.0;
    for (int i = 0; i < n; ++i) {
        sumf += (ggml_float)(x[i]*y[i]);
    }
#endif

    *s = sumf;
}
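For reference, the SIMD path above walks the arrays in blocks of GGML_F32_STEP elements, keeping GGML_F32_ARR independent accumulators (one per register of GGML_F32_EPR lanes) and finishing the tail scalar-wise. A plain-C sketch of the same blocking, with illustrative lane counts (EPR = 8 and ARR = 4 are assumed values for the sketch, not taken from this patch):

#include <stddef.h>

#define EPR  8              /* lanes per "register" (assumed) */
#define ARR  4              /* registers per step (assumed)   */
#define STEP (EPR*ARR)      /* elements consumed per outer iteration */

static double dot_blocked_sketch(int n, const float * x, const float * y) {
    double acc[ARR] = { 0.0 };              // independent accumulators hide FMA latency
    const int np = n & ~(STEP - 1);         // largest multiple of STEP not exceeding n

    for (int i = 0; i < np; i += STEP) {
        for (int j = 0; j < ARR; j++) {
            for (int k = 0; k < EPR; k++) { // one vector FMA per j in the real code
                acc[j] += (double) x[i + j*EPR + k] * y[i + j*EPR + k];
            }
        }
    }

    double sum = acc[0] + acc[1] + acc[2] + acc[3];  // the REDUCE step
    for (int i = np; i < n; ++i) {                   // leftovers
        sum += (double) x[i] * y[i];
    }
    return sum;
}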
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    int i = 0;
    ggml_float sumf = 0;

#if defined(__AVX512BF16__)
    __m512 c1 = _mm512_setzero_ps();
    __m512 c2 = _mm512_setzero_ps();
    for (; i + 64 <= n; i += 64) {
        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
                             m512bh(_mm512_loadu_si512((y + i))));
        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
                             m512bh(_mm512_loadu_si512((y + i + 32))));
    }
    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
    sumf += (ggml_float)_mm512_reduce_add_ps(c2);

#elif defined(__AVX512F__)
#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
    __m512 c1 = _mm512_setzero_ps();
    __m512 c2 = _mm512_setzero_ps();
    for (; i + 32 <= n; i += 32) {
        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
    }
    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
    sumf += (ggml_float)_mm512_reduce_add_ps(c2);

#undef LOAD
#elif defined(__AVX2__) || defined(__AVX__)
#if defined(__AVX2__)
#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
#else
#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
#endif
    __m256 c1 = _mm256_setzero_ps();
    __m256 c2 = _mm256_setzero_ps();
    __m256 c3 = _mm256_setzero_ps();
    __m256 c4 = _mm256_setzero_ps();
    for (; i + 32 <= n; i += 32) {
        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
    }
    __m128 g;
    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
                       _mm256_add_ps(c2, c4));
    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
                   _mm256_castps256_ps128(c1));
    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
    g = _mm_add_ss(g, _mm_movehdup_ps(g));
    sumf += (ggml_float)_mm_cvtss_f32(g);

#undef LOAD
#endif

    for (; i < n; ++i) {
        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
                             GGML_BF16_TO_FP32(y[i]));
    }
    *s = sumf;
}

void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);

    ggml_float sumf = 0.0;

#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }

    // reduce sum0..sum3 to sum0
    GGML_F16_VEC_REDUCE(sumf, sum);

    // leftovers
    for (int i = np; i < n; ++i) {
        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
    }
#else
    for (int i = 0; i < n; ++i) {
        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
    }
#endif

    *s = sumf;
}

void ggml_vec_silu_f32(const int n, float * y, const float * x) {
    int i = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
    for (; i + 15 < n; i += 16) {
        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
    }
#elif defined(__AVX2__) && defined(__FMA__)
    for (; i + 7 < n; i += 8) {
        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
    }
#elif defined(__SSE2__)
    for (; i + 3 < n; i += 4) {
        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
    }
#elif defined(__ARM_NEON) && defined(__aarch64__)
    for (; i + 3 < n; i += 4) {
        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
    }
#endif
    for (; i < n; ++i) {
        y[i] = ggml_silu_f32(x[i]);
    }
}

ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
    int i = 0;
    ggml_float sum = 0;
#if defined(__AVX512F__) && defined(__AVX512DQ__)
    for (; i + 15 < n; i += 16) {
        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
                                               _mm512_set1_ps(max)));
        _mm512_storeu_ps(y + i, val);
        sum += (ggml_float)_mm512_reduce_add_ps(val);
    }
#elif defined(__AVX2__) && defined(__FMA__)
    for (; i + 7 < n; i += 8) {
        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
                                               _mm256_set1_ps(max)));
        _mm256_storeu_ps(y + i, val);
        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
                                 _mm256_castps256_ps128(val));
        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
        sum += (ggml_float)_mm_cvtss_f32(val2);
    }
#elif defined(__SSE2__)
    for (; i + 3 < n; i += 4) {
        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
                                            _mm_set1_ps(max)));
        _mm_storeu_ps(y + i, val);
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
        val = _mm_add_ss(val, _mm_movehdup_ps(val));
#else
        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
        val = _mm_add_ps(val, tmp);
        tmp = _mm_movehl_ps(tmp, val);
        val = _mm_add_ss(val, tmp);
#endif
        sum += (ggml_float)_mm_cvtss_f32(val);
    }
#elif defined(__ARM_NEON) && defined(__aarch64__)
    for (; i + 3 < n; i += 4) {
        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
                                                vdupq_n_f32(max)));
        vst1q_f32(y + i, val);
        sum += (ggml_float)vaddvq_f32(val);
    }
#endif
    for (; i < n; ++i) {
        float val = expf(x[i] - max);
        sum += (ggml_float)val;
        y[i] = val;
    }
    return sum;
}

ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
    // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)

    int i = 0;
    ggml_float sum = 0;
    for (; i < n; ++i) {
        float val = x[i] - max;
        y[i] = val;
        sum += (ggml_float)expf(val);
    }
    return sum = (ggml_float)logf(sum);
}
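Note that ggml_vec_soft_max_f32 only produces the unnormalized exponentials exp(x[i] - max) and returns their sum; the caller is expected to supply the row maximum (for numerical stability) and to divide by the returned sum. A hedged usage sketch, assuming it is compiled and linked inside ggml-cpu where vec.h is on the include path:

#include "vec.h"

// softmax over one row of n floats, composed from the helpers declared in vec.h
static void softmax_row_sketch(int n, float * y, const float * x) {
    float max = 0.0f;
    ggml_vec_max_f32(n, &max, x);                          // 1) row maximum
    ggml_float sum = ggml_vec_soft_max_f32(n, y, x, max);  // 2) y[i] = exp(x[i] - max), returns the sum
    ggml_vec_scale_f32(n, y, 1.0f/(float) sum);            // 3) normalize so the row sums to 1
}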
ggml/src/ggml-cpu/vec.h
ADDED
@@ -0,0 +1,802 @@
// Vectorized functions for fundamental operations

#pragma once

#include "ggml-impl.h"
#include "simd-mappings.h"
#include "ggml.h"

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

// floating point type used to accumulate sums
typedef double ggml_float;

#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16

#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL  2
#define GGML_VEC_MAD_UNROLL  32

#ifdef __cplusplus
extern "C" {
#endif

//
// global data
//

// precomputed gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];

// precomputed quick gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];

//
// fundamental operations
//

void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);

void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);

inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
    }
}

inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
    }
}
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
    }
}

// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };

    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
    }

#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);

                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
            }
        }
    }

    // reduce sum0..sum3 to sum0
    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
        }
    }
#else
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
        }
    }
#endif

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        s[i] = (float)sumf[i];
    }
}

inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
}

inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
    }
#endif
}

// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {

    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
        x[i] = (const float *) ((const char *) xv + i*xs);
        v[i] = (const float *) ((const char *) vv + i*vs);
    }

#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
    }

    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
            }

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = np; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
}
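In ggml_vec_dot_f16_unroll and ggml_vec_mad_f32_unroll above, xs and vs are strides in bytes, which is why the row pointers are derived by casting to char * before doing the pointer arithmetic. A small sketch of that addressing for an assumed row-major matrix whose physical row size may include padding (illustrative only, not part of the patch):

#include <stddef.h>

// row i starts i*row_bytes bytes after the first row, regardless of element type
static const float * row_ptr_sketch(const void * base, size_t row_bytes, int i) {
    return (const float *) ((const char *) base + (size_t) i * row_bytes);
}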
//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}

inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
    }
#endif
}

inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_FP16_TO_FP32(x[i]);
        y[i] = GGML_FP32_TO_FP16(v*v);
    }
}
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_FP16_TO_FP32(x[i]);
        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
    }
}
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
    }
}
inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
    }
}
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_FP16_TO_FP32(x[i]);
        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
    }
}
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_FP16_TO_FP32(x[i]);
        y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
    }
}
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
    }
}
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_FP16_TO_FP32(x[i]);
        y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
    }
}
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
    }
}
inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
    }
}

static const float GELU_COEF_A     = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;

inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_table_gelu_f16[i16[i]];
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i];
        } else {
            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
        }
    }
}
#else
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]);
    }
}
#endif
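ggml_vec_gelu_f16 and the GGML_GELU_FP16 path index ggml_table_gelu_f16 directly with the raw 16-bit pattern of the input, so the table has to hold gelu() of every representable f16 value. The table storage is defined in vec.cpp above, but its initialization is not part of this diff; a hedged sketch of how such a table could be populated (assumed initialization, shown only to illustrate the indexing scheme):

#include "vec.h"
#include <stdint.h>
#include <string.h>

static void init_gelu_table_sketch(void) {
    for (uint32_t t = 0; t < (1u << 16); ++t) {
        uint16_t u = (uint16_t) t;
        ggml_fp16_t h;
        memcpy(&h, &u, sizeof(h));        // reinterpret the bit pattern as an f16 value
        float x = GGML_FP16_TO_FP32(h);
        ggml_table_gelu_f16[t] = GGML_FP32_TO_FP16(ggml_gelu_f32(x));
    }
}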
inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}

//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
//  const uint16_t * i16 = (const uint16_t *) x;
//  for (int i = 0; i < n; ++i) {
//      y[i] = ggml_table_gelu_quick_f16[i16[i]];
//  }
//}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
    }
}
#else
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]);
    }
}
#endif

inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_FP16_TO_FP32(x[i]);
        y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
    }
}

// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
    return x/(1.0f + expf(-x));
}
inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
    float v = GGML_FP16_TO_FP32(x);
    return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
}

#if __FINITE_MATH_ONLY__
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif

#if defined(__ARM_NEON) && defined(__aarch64__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
    const float32x4_t n = vsubq_f32(z, r);
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
    const float32x4_t u = vmulq_f32(b, b);
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static float32x4_t ggml_v_silu(float32x4_t x) {
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t neg_x = vsubq_f32(zero, x);
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
    return vdivq_f32(x, one_plus_exp_neg_x);
}

#elif defined(__AVX512F__) && defined(__AVX512DQ__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
    const __m512 r = _mm512_set1_ps(0x1.8p23f);
    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
    const __m512 n = _mm512_sub_ps(z, r);
    const __m512 b =
        _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                         _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
    const __mmask16 d =
        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
    const __m512 u = _mm512_mul_ps(b, b);
    const __m512 j = _mm512_fmadd_ps(
        _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                        _mm512_set1_ps(0x1.573e2ep-5f)),
                        u,
                        _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                        _mm512_set1_ps(0x1.fffdb6p-2f))),
        u,
        _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
    const __m512 res = _mm512_scalef_ps(j, n);
    if (_mm512_kortestz(d, d))
        return res;
    const __m512 zero = _mm512_setzero_ps();
    const __m512 alt = _mm512_mask_blend_ps(
        _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
    return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m512 ggml_v_silu(__m512 x) {
    const __m512 one = _mm512_set1_ps(1);
    const __m512 zero = _mm512_setzero_ps();
    const __m512 neg_x = _mm512_sub_ps(zero, x);
    const __m512 exp_neg_x = ggml_v_expf(neg_x);
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
    return _mm512_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__AVX2__) && defined(__FMA__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m256 ggml_v_expf(__m256 x) {
    const __m256 r = _mm256_set1_ps(0x1.8p23f);
    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
    const __m256 n = _mm256_sub_ps(z, r);
    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
    const __m256 k = _mm256_castsi256_ps(
        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
    const __m256i c = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(126), _CMP_GT_OQ));
    const __m256 u = _mm256_mul_ps(b, b);
    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
        return _mm256_fmadd_ps(j, k, k);
    const __m256i g = _mm256_and_si256(
        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
        _mm256_set1_epi32(0x82000000u));
    const __m256 s1 =
        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
    const __m256i d = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(192), _CMP_GT_OQ));
    return _mm256_or_ps(
        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
        _mm256_andnot_ps(
            _mm256_castsi256_ps(d),
            _mm256_or_ps(
                _mm256_and_ps(_mm256_castsi256_ps(c),
                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m256 ggml_v_silu(__m256 x) {
    const __m256 one = _mm256_set1_ps(1);
    const __m256 zero = _mm256_setzero_ps();
    const __m256 neg_x = _mm256_sub_ps(zero, x);
    const __m256 exp_neg_x = ggml_v_expf(neg_x);
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
    return _mm256_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON

#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 ggml_v_silu(__m128 x) {
    const __m128 one = _mm_set1_ps(1);
    const __m128 zero = _mm_setzero_ps();
    const __m128 neg_x = _mm_sub_ps(zero, x);
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
    return _mm_div_ps(x, one_plus_exp_neg_x);
}

#endif // __ARM_NEON / __AVX2__ / __SSE2__

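All three ggml_v_expf variants implement the same scheme: split x as n*ln2 + r with n an integer (obtained through the 0x1.8p23 rounding trick), evaluate a short polynomial for e^r, then scale by 2^n via exponent-field arithmetic, with a separate path for very large |n|. A low-precision scalar sketch of the same decomposition (illustrative constants and polynomial, not the ones used above):

#include <math.h>

// e^x = 2^n * e^r, where n = round(x/ln2) and |r| <= ln(2)/2
static float expf_sketch(float x) {
    const float LOG2E  = 1.4426950408889634f;
    const float LN2_HI = 0.693145751953125f;   // ln(2) split into high and low parts
    const float LN2_LO = 1.42860677e-06f;      // so the reduction stays accurate

    float n = rintf(x * LOG2E);
    float r = (x - n*LN2_HI) - n*LN2_LO;

    // short Taylor polynomial for e^r on the reduced range
    float p = 1.0f + r*(1.0f + r*(0.5f + r*(1.0f/6.0f + r*(1.0f/24.0f + r*(1.0f/120.0f)))));

    return ldexpf(p, (int) n);                 // multiply by 2^n through the exponent field
}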
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_silu_f16(x[i]);
    }
}

inline static float ggml_silu_backward_f32(float x, float dy) {
    const float s = 1.0f/(1.0f + expf(-x));
    return dy*s*(1.0f + x*(1.0f - s));
}

inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float v = GGML_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}

inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
    }
}

inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
    }
}

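For reference, the expression in ggml_silu_backward_f32 follows from differentiating silu(x) = x*s(x) with s(x) = 1/(1 + e^(-x)): since s'(x) = s(x)*(1 - s(x)), d/dx [x*s(x)] = s(x) + x*s(x)*(1 - s(x)) = s(x)*(1 + x*(1 - s(x))), and multiplying by the incoming gradient dy gives exactly dy*s*(1.0f + x*(1.0f - s)) as written in the code.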
inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = (float)sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = sum;
}

inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}

inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}

inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float max = -INFINITY;
    int idx = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
        if (max == x[i]) { idx = i; }
    }
    *s = idx;
}

#ifdef __cplusplus
}
#endif
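The inline helpers in this header are what the per-tensor operators (now living in ops.cpp) call row by row. A hedged sketch of that usage pattern, assuming compilation inside ggml-cpu where vec.h is on the include path; row_update_sketch is an illustrative helper, not a function from this patch:

#include "vec.h"

// axpy-style update of one row, y += v*x, followed by a running sum of y
static float row_update_sketch(int n, float * y, const float * x, float v) {
    ggml_vec_mad_f32(n, y, x, v);   // y[i] += x[i]*v (takes the SIMD path when GGML_SIMD is defined)
    float sum = 0.0f;
    ggml_vec_sum_f32(n, &sum, y);   // accumulates in ggml_float (double) internally
    return sum;
}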