Spaces:
Sleeping
Sleeping
Thomas Fitzsimmons
commited on
Commit
·
0d5a830
1
Parent(s):
f39e0be
ggml : add f16 acceleration for POWER9 ppc64le
Browse files
Makefile
CHANGED
|
@@ -105,6 +105,12 @@ endif
|
|
| 105 |
ifeq ($(UNAME_M),amd64)
|
| 106 |
CFLAGS += -mavx -mavx2 -mfma -mf16c
|
| 107 |
endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
ifndef WHISPER_NO_ACCELERATE
|
| 109 |
# Mac M1 - include Accelerate framework
|
| 110 |
ifeq ($(UNAME_S),Darwin)
|
|
|
|
| 105 |
ifeq ($(UNAME_M),amd64)
|
| 106 |
CFLAGS += -mavx -mavx2 -mfma -mf16c
|
| 107 |
endif
|
| 108 |
+
ifeq ($(UNAME_M),ppc64le)
|
| 109 |
+
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
|
| 110 |
+
ifneq (,$(findstring POWER9,$(POWER9_M)))
|
| 111 |
+
CFLAGS += -mpower9-vector
|
| 112 |
+
endif
|
| 113 |
+
endif
|
| 114 |
ifndef WHISPER_NO_ACCELERATE
|
| 115 |
# Mac M1 - include Accelerate framework
|
| 116 |
ifeq ($(UNAME_S),Darwin)
|
ggml.c
CHANGED
|
@@ -138,8 +138,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|
| 138 |
#ifdef __wasm_simd128__
|
| 139 |
#include <wasm_simd128.h>
|
| 140 |
#else
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
#include <immintrin.h>
|
| 142 |
#endif
|
|
|
|
| 143 |
|
| 144 |
#ifdef __F16C__
|
| 145 |
float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
|
@@ -702,6 +708,57 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
|
|
| 702 |
//GGML_ASSERT(false);
|
| 703 |
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 704 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
#elif defined(__wasm_simd128__)
|
| 706 |
// WASM 128-bit
|
| 707 |
const int n16 = (n & ~15);
|
|
@@ -1063,6 +1120,63 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
|
|
| 1063 |
GGML_ASSERT(false);
|
| 1064 |
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1065 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1066 |
#elif defined(__wasm_simd128__)
|
| 1067 |
// WASM SIMD 128-bit
|
| 1068 |
const int n16 = (n & ~15);
|
|
|
|
| 138 |
#ifdef __wasm_simd128__
|
| 139 |
#include <wasm_simd128.h>
|
| 140 |
#else
|
| 141 |
+
#ifdef __POWER9_VECTOR__
|
| 142 |
+
#include <altivec.h>
|
| 143 |
+
#undef bool
|
| 144 |
+
#define bool _Bool
|
| 145 |
+
#else
|
| 146 |
#include <immintrin.h>
|
| 147 |
#endif
|
| 148 |
+
#endif
|
| 149 |
|
| 150 |
#ifdef __F16C__
|
| 151 |
float ggml_fp16_to_fp32(ggml_fp16_t h) {
|
|
|
|
| 708 |
//GGML_ASSERT(false);
|
| 709 |
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 710 |
}
|
| 711 |
+
#elif defined(__POWER9_VECTOR__)
|
| 712 |
+
const int n32 = (n & ~31);
|
| 713 |
+
|
| 714 |
+
vector float sum0 = vec_splats (0.0f);
|
| 715 |
+
|
| 716 |
+
for (int i = 0; i < n32; i += 32) {
|
| 717 |
+
// Use vec_xl, not vec_ld, because x is sometimes unaligned.
|
| 718 |
+
vector unsigned short x0 = vec_xl(i * 2 + 0, x);
|
| 719 |
+
vector unsigned short x1 = vec_xl(i * 2 + 16, x);
|
| 720 |
+
vector unsigned short x2 = vec_xl(i * 2 + 32, x);
|
| 721 |
+
vector unsigned short x3 = vec_xl(i * 2 + 48, x);
|
| 722 |
+
|
| 723 |
+
vector unsigned short y0 = vec_xl(i * 2 + 0, y);
|
| 724 |
+
vector unsigned short y1 = vec_xl(i * 2 + 16, y);
|
| 725 |
+
vector unsigned short y2 = vec_xl(i * 2 + 32, y);
|
| 726 |
+
vector unsigned short y3 = vec_xl(i * 2 + 48, y);
|
| 727 |
+
|
| 728 |
+
vector float fx0l = vec_extract_fp32_from_shortl(x0);
|
| 729 |
+
vector float fx0h = vec_extract_fp32_from_shorth(x0);
|
| 730 |
+
vector float fx1l = vec_extract_fp32_from_shortl(x1);
|
| 731 |
+
vector float fx1h = vec_extract_fp32_from_shorth(x1);
|
| 732 |
+
vector float fx2l = vec_extract_fp32_from_shortl(x2);
|
| 733 |
+
vector float fx2h = vec_extract_fp32_from_shorth(x2);
|
| 734 |
+
vector float fx3l = vec_extract_fp32_from_shortl(x3);
|
| 735 |
+
vector float fx3h = vec_extract_fp32_from_shorth(x3);
|
| 736 |
+
|
| 737 |
+
vector float fy0l = vec_extract_fp32_from_shortl(y0);
|
| 738 |
+
vector float fy0h = vec_extract_fp32_from_shorth(y0);
|
| 739 |
+
vector float fy1l = vec_extract_fp32_from_shortl(y1);
|
| 740 |
+
vector float fy1h = vec_extract_fp32_from_shorth(y1);
|
| 741 |
+
vector float fy2l = vec_extract_fp32_from_shortl(y2);
|
| 742 |
+
vector float fy2h = vec_extract_fp32_from_shorth(y2);
|
| 743 |
+
vector float fy3l = vec_extract_fp32_from_shortl(y3);
|
| 744 |
+
vector float fy3h = vec_extract_fp32_from_shorth(y3);
|
| 745 |
+
|
| 746 |
+
sum0 = vec_add(sum0, vec_mul(fx0l, fy0l));
|
| 747 |
+
sum0 = vec_add(sum0, vec_mul(fx0h, fy0h));
|
| 748 |
+
sum0 = vec_add(sum0, vec_mul(fx1l, fy1l));
|
| 749 |
+
sum0 = vec_add(sum0, vec_mul(fx1h, fy1h));
|
| 750 |
+
sum0 = vec_add(sum0, vec_mul(fx2l, fy2l));
|
| 751 |
+
sum0 = vec_add(sum0, vec_mul(fx2h, fy2h));
|
| 752 |
+
sum0 = vec_add(sum0, vec_mul(fx3l, fy3l));
|
| 753 |
+
sum0 = vec_add(sum0, vec_mul(fx3h, fy3h));
|
| 754 |
+
}
|
| 755 |
+
|
| 756 |
+
sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
|
| 757 |
+
+ vec_extract(sum0, 2) + vec_extract(sum0, 3);
|
| 758 |
+
|
| 759 |
+
for (int i = n32; i < n; ++i) {
|
| 760 |
+
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
|
| 761 |
+
}
|
| 762 |
#elif defined(__wasm_simd128__)
|
| 763 |
// WASM 128-bit
|
| 764 |
const int n16 = (n & ~15);
|
|
|
|
| 1120 |
GGML_ASSERT(false);
|
| 1121 |
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1122 |
}
|
| 1123 |
+
#elif defined(__POWER9_VECTOR__)
|
| 1124 |
+
const int n32 = (n & ~31);
|
| 1125 |
+
for (int i = 0; i < n32; i += 32) {
|
| 1126 |
+
// Use vec_xl, not vec_ld, because x is sometimes unaligned!
|
| 1127 |
+
vector unsigned short x0 = vec_xl(i * 2 + 0, x);
|
| 1128 |
+
vector unsigned short x1 = vec_xl(i * 2 + 16, x);
|
| 1129 |
+
vector unsigned short x2 = vec_xl(i * 2 + 32, x);
|
| 1130 |
+
vector unsigned short x3 = vec_xl(i * 2 + 48, x);
|
| 1131 |
+
|
| 1132 |
+
vector unsigned short y0 = vec_xl(i * 2 + 0, y);
|
| 1133 |
+
vector unsigned short y1 = vec_xl(i * 2 + 16, y);
|
| 1134 |
+
vector unsigned short y2 = vec_xl(i * 2 + 32, y);
|
| 1135 |
+
vector unsigned short y3 = vec_xl(i * 2 + 48, y);
|
| 1136 |
+
|
| 1137 |
+
vector float v4 = vec_splats(v);
|
| 1138 |
+
|
| 1139 |
+
vector float fx0l = vec_extract_fp32_from_shortl(x0);
|
| 1140 |
+
vector float fx0h = vec_extract_fp32_from_shorth(x0);
|
| 1141 |
+
vector float fx1l = vec_extract_fp32_from_shortl(x1);
|
| 1142 |
+
vector float fx1h = vec_extract_fp32_from_shorth(x1);
|
| 1143 |
+
vector float fx2l = vec_extract_fp32_from_shortl(x2);
|
| 1144 |
+
vector float fx2h = vec_extract_fp32_from_shorth(x2);
|
| 1145 |
+
vector float fx3l = vec_extract_fp32_from_shortl(x3);
|
| 1146 |
+
vector float fx3h = vec_extract_fp32_from_shorth(x3);
|
| 1147 |
+
|
| 1148 |
+
vector float fy0l = vec_extract_fp32_from_shortl(y0);
|
| 1149 |
+
vector float fy0h = vec_extract_fp32_from_shorth(y0);
|
| 1150 |
+
vector float fy1l = vec_extract_fp32_from_shortl(y1);
|
| 1151 |
+
vector float fy1h = vec_extract_fp32_from_shorth(y1);
|
| 1152 |
+
vector float fy2l = vec_extract_fp32_from_shortl(y2);
|
| 1153 |
+
vector float fy2h = vec_extract_fp32_from_shorth(y2);
|
| 1154 |
+
vector float fy3l = vec_extract_fp32_from_shortl(y3);
|
| 1155 |
+
vector float fy3h = vec_extract_fp32_from_shorth(y3);
|
| 1156 |
+
|
| 1157 |
+
fy0l = vec_madd(fx0l, v4, fy0l);
|
| 1158 |
+
fy0h = vec_madd(fx0h, v4, fy0h);
|
| 1159 |
+
fy1l = vec_madd(fx1l, v4, fy1l);
|
| 1160 |
+
fy1h = vec_madd(fx1h, v4, fy1h);
|
| 1161 |
+
fy2l = vec_madd(fx2l, v4, fy2l);
|
| 1162 |
+
fy2h = vec_madd(fx2h, v4, fy2h);
|
| 1163 |
+
fy3l = vec_madd(fx3l, v4, fy3l);
|
| 1164 |
+
fy3h = vec_madd(fx3h, v4, fy3h);
|
| 1165 |
+
|
| 1166 |
+
y0 = vec_pack_to_short_fp32(fy0h, fy0l);
|
| 1167 |
+
y1 = vec_pack_to_short_fp32(fy1h, fy1l);
|
| 1168 |
+
y2 = vec_pack_to_short_fp32(fy2h, fy2l);
|
| 1169 |
+
y3 = vec_pack_to_short_fp32(fy3h, fy3l);
|
| 1170 |
+
|
| 1171 |
+
vec_xst(y0, i * 2 + 0, y);
|
| 1172 |
+
vec_xst(y1, i * 2 + 16, y);
|
| 1173 |
+
vec_xst(y2, i * 2 + 32, y);
|
| 1174 |
+
vec_xst(y3, i * 2 + 48, y);
|
| 1175 |
+
}
|
| 1176 |
+
|
| 1177 |
+
for (int i = n32; i < n; ++i) {
|
| 1178 |
+
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
| 1179 |
+
}
|
| 1180 |
#elif defined(__wasm_simd128__)
|
| 1181 |
// WASM SIMD 128-bit
|
| 1182 |
const int n16 = (n & ~15);
|