Spaces:
Sleeping
Sleeping
Add OpenBLAS support
Browse files
Supported via CMake - just add:
cmake .. -DWHISPER_SUPPORT_OPENBLAS=ON
On Ubuntu, you have to install the library like this:
apt install libopenblas-dev
Unfortunately, I don't observe any benefit compared to the
original AVX2 + FP16 implementation. Maybe I'm missing something.
- CMakeLists.txt +18 -1
- ggml.c +43 -43
CMakeLists.txt
CHANGED
|
@@ -41,8 +41,13 @@ option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STAND
|
|
| 41 |
|
| 42 |
option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
option(WHISPER_PERF "whisper: enable perf timings" OFF)
|
| 45 |
-
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
|
| 46 |
|
| 47 |
# sanitizers
|
| 48 |
|
|
@@ -86,6 +91,18 @@ if (APPLE AND NOT WHISPER_NO_ACCELERATE)
|
|
| 86 |
endif()
|
| 87 |
endif()
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
# compiler flags
|
| 90 |
|
| 91 |
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
|
|
|
| 41 |
|
| 42 |
option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)
|
| 43 |
|
| 44 |
+
if (APPLE)
|
| 45 |
+
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
|
| 46 |
+
else()
|
| 47 |
+
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
|
| 48 |
+
endif()
|
| 49 |
+
|
| 50 |
option(WHISPER_PERF "whisper: enable perf timings" OFF)
|
|
|
|
| 51 |
|
| 52 |
# sanitizers
|
| 53 |
|
|
|
|
| 91 |
endif()
|
| 92 |
endif()
|
| 93 |
|
| 94 |
+
if (WHISPER_SUPPORT_OPENBLAS)
|
| 95 |
+
find_library(OPENBLAS_LIB openblas)
|
| 96 |
+
if (OPENBLAS_LIB)
|
| 97 |
+
message(STATUS "OpenBLAS found")
|
| 98 |
+
|
| 99 |
+
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${OPENBLAS_LIB})
|
| 100 |
+
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
|
| 101 |
+
else()
|
| 102 |
+
message(WARNING "OpenBLAS not found")
|
| 103 |
+
endif()
|
| 104 |
+
endif()
|
| 105 |
+
|
| 106 |
# compiler flags
|
| 107 |
|
| 108 |
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
ggml.c
CHANGED
|
@@ -76,6 +76,8 @@ typedef void* thread_ret_t;
|
|
| 76 |
|
| 77 |
#ifdef GGML_USE_ACCELERATE
|
| 78 |
#include <Accelerate/Accelerate.h>
|
|
|
|
|
|
|
| 79 |
#endif
|
| 80 |
|
| 81 |
// floating point type used to accumulate sums
|
|
@@ -4055,46 +4057,44 @@ void ggml_compute_forward_mul_mat_f32(
|
|
| 4055 |
// nb00 < nb01 - src0 is transposed
|
| 4056 |
// compute by src0 columns
|
| 4057 |
|
| 4058 |
-
|
| 4059 |
-
|
| 4060 |
-
|
| 4061 |
-
|
| 4062 |
-
|
| 4063 |
-
|
| 4064 |
-
|
| 4065 |
-
|
| 4066 |
-
|
| 4067 |
-
|
| 4068 |
-
|
| 4069 |
-
|
| 4070 |
-
|
| 4071 |
-
|
| 4072 |
-
|
| 4073 |
-
|
| 4074 |
-
|
| 4075 |
-
|
| 4076 |
-
|
| 4077 |
-
|
| 4078 |
-
|
| 4079 |
-
|
| 4080 |
-
//
|
| 4081 |
-
|
| 4082 |
-
|
| 4083 |
-
|
| 4084 |
-
|
| 4085 |
-
|
| 4086 |
-
|
| 4087 |
-
|
| 4088 |
-
|
| 4089 |
-
|
| 4090 |
-
|
| 4091 |
-
//
|
| 4092 |
-
|
| 4093 |
-
|
| 4094 |
-
|
| 4095 |
-
|
| 4096 |
-
// }
|
| 4097 |
-
//#endif
|
| 4098 |
|
| 4099 |
if (params->type == GGML_TASK_INIT) {
|
| 4100 |
if (nb01 >= nb00) {
|
|
@@ -4301,7 +4301,7 @@ void ggml_compute_forward_mul_mat_f16_f32(
|
|
| 4301 |
// nb00 < nb01 - src0 is transposed
|
| 4302 |
// compute by src0 columns
|
| 4303 |
|
| 4304 |
-
#
|
| 4305 |
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
| 4306 |
GGML_ASSERT(nb10 == sizeof(float));
|
| 4307 |
|
|
@@ -6857,7 +6857,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
| 6857 |
} else {
|
| 6858 |
if (node->src0->type == GGML_TYPE_F16 &&
|
| 6859 |
node->src1->type == GGML_TYPE_F32) {
|
| 6860 |
-
#
|
| 6861 |
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
| 6862 |
cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
|
| 6863 |
} else {
|
|
@@ -8074,7 +8074,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
| 8074 |
}
|
| 8075 |
|
| 8076 |
int ggml_cpu_has_blas(void) {
|
| 8077 |
-
#if defined(
|
| 8078 |
return 1;
|
| 8079 |
#else
|
| 8080 |
return 0;
|
|
|
|
| 76 |
|
| 77 |
#ifdef GGML_USE_ACCELERATE
|
| 78 |
#include <Accelerate/Accelerate.h>
|
| 79 |
+
#elif GGML_USE_OPENBLAS
|
| 80 |
+
#include <cblas.h>
|
| 81 |
#endif
|
| 82 |
|
| 83 |
// floating point type used to accumulate sums
|
|
|
|
| 4057 |
// nb00 < nb01 - src0 is transposed
|
| 4058 |
// compute by src0 columns
|
| 4059 |
|
| 4060 |
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
| 4061 |
+
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
| 4062 |
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
| 4063 |
+
GGML_ASSERT(nb10 == sizeof(float));
|
| 4064 |
+
|
| 4065 |
+
if (params->ith != 0) return;
|
| 4066 |
+
|
| 4067 |
+
if (params->type == GGML_TASK_INIT) {
|
| 4068 |
+
return;
|
| 4069 |
+
}
|
| 4070 |
+
|
| 4071 |
+
if (params->type == GGML_TASK_FINALIZE) {
|
| 4072 |
+
return;
|
| 4073 |
+
}
|
| 4074 |
+
|
| 4075 |
+
for (int i03 = 0; i03 < ne03; i03++) {
|
| 4076 |
+
for (int i02 = 0; i02 < ne02; i02++) {
|
| 4077 |
+
const float * x = (float *) (src0->data);
|
| 4078 |
+
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
| 4079 |
+
|
| 4080 |
+
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
| 4081 |
+
|
| 4082 |
+
// zT = y * xT
|
| 4083 |
+
{
|
| 4084 |
+
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
| 4085 |
+
ne11, ne01, ne10,
|
| 4086 |
+
1.0f, y, ne10,
|
| 4087 |
+
x, ne10,
|
| 4088 |
+
0.0f, d, ne01);
|
| 4089 |
+
}
|
| 4090 |
+
}
|
| 4091 |
+
}
|
| 4092 |
+
|
| 4093 |
+
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
| 4094 |
+
|
| 4095 |
+
return;
|
| 4096 |
+
}
|
| 4097 |
+
#endif
|
|
|
|
|
|
|
| 4098 |
|
| 4099 |
if (params->type == GGML_TASK_INIT) {
|
| 4100 |
if (nb01 >= nb00) {
|
|
|
|
| 4301 |
// nb00 < nb01 - src0 is transposed
|
| 4302 |
// compute by src0 columns
|
| 4303 |
|
| 4304 |
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
| 4305 |
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
| 4306 |
GGML_ASSERT(nb10 == sizeof(float));
|
| 4307 |
|
|
|
|
| 6857 |
} else {
|
| 6858 |
if (node->src0->type == GGML_TYPE_F16 &&
|
| 6859 |
node->src1->type == GGML_TYPE_F32) {
|
| 6860 |
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
| 6861 |
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
| 6862 |
cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
|
| 6863 |
} else {
|
|
|
|
| 8074 |
}
|
| 8075 |
|
| 8076 |
int ggml_cpu_has_blas(void) {
|
| 8077 |
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
| 8078 |
return 1;
|
| 8079 |
#else
|
| 8080 |
return 0;
|