ggerganov committed
Commit b69a5a5 · 1 Parent(s): 4c78bf0

wip : rpi4 support

Files changed (4)
  1. Makefile +16 -7
  2. ggml.c +123 -7
  3. ggml.h +1 -1
  4. whisper.cpp +11 -6
Makefile CHANGED
@@ -15,10 +15,12 @@ CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
-    CFLAGS += -pthread
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
 endif
 ifeq ($(UNAME_S),Darwin)
-    CFLAGS += -pthread
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
 endif

 # Architecture specific
@@ -26,14 +28,21 @@ ifeq ($(UNAME_P),x86_64)
     CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
 ifneq ($(filter arm%,$(UNAME_P)),)
-    CFLAGS += -mfpu=neon
+    # Mac M1
 endif
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-    CFLAGS += -mfpu=neon
+ifneq ($(filter aarch64%,$(UNAME_P)),)
+endif
+ifneq ($(filter armv6%,$(UNAME_M)),)
+    # Raspberry Pi 1, 2, 3
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
-ifneq ($(filter armv%,$(UNAME_M)),)
+ifneq ($(filter armv7%,$(UNAME_M)),)
     # Raspberry Pi 4
-    CFLAGS += -mcpu=cortex-a72 -mfloat-abi=hard -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+ifneq ($(filter armv8%,$(UNAME_M)),)
+    # Raspberry Pi 4
+    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

 #
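The armv6/armv7/armv8 split matters because the ggml.c changes below choose between native FP16 NEON arithmetic and an FP32 conversion fallback based on what the compiler advertises. As a quick sanity check on a given board, a minimal sketch (hypothetical check.cpp, not part of this commit) built with the same flags prints which feature macros the toolchain defines:

// check.cpp - hypothetical helper, not part of this commit.
// Compile with the architecture flags above and run on the target board
// to see which SIMD paths in ggml.c will be compiled in.
#include <cstdio>

int main() {
#if defined(__ARM_NEON)
    std::printf("__ARM_NEON: defined (NEON paths enabled)\n");
#else
    std::printf("__ARM_NEON: not defined (scalar fallback)\n");
#endif
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    std::printf("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC: defined (native FP16 path)\n");
#else
    std::printf("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC: not defined (FP32 conversion path)\n");
#endif
    return 0;
}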
ggml.c CHANGED
@@ -1,5 +1,6 @@
 #include "ggml.h"

+#include <alloca.h>
 #include <assert.h>
 #include <time.h>
 #include <math.h>
@@ -12,7 +13,12 @@
 #include <pthread.h>

 #define GGML_DEBUG 0
-#define GGML_MEM_ALIGN 16
+
+#if UINTPTR_MAX == 0xFFFFFFFF
+#define GGML_MEM_ALIGN 4
+#else
+#define GGML_MEM_ALIGN 16
+#endif

 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -305,6 +311,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 #ifdef __ARM_NEON
     const int n32 = (n & ~31);

+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     float16x8_t sum0 = vdupq_n_f16(0);
     float16x8_t sum1 = vdupq_n_f16(0);
     float16x8_t sum2 = vdupq_n_f16(0);
@@ -344,6 +351,61 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t

     float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0f32), vget_high_f32(sum0f32));
     sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+#else
+    float32x4_t sum0 = vdupq_n_f32(0);
+    float32x4_t sum1 = vdupq_n_f32(0);
+    float32x4_t sum2 = vdupq_n_f32(0);
+    float32x4_t sum3 = vdupq_n_f32(0);
+    float32x4_t sum4 = vdupq_n_f32(0);
+    float32x4_t sum5 = vdupq_n_f32(0);
+    float32x4_t sum6 = vdupq_n_f32(0);
+    float32x4_t sum7 = vdupq_n_f32(0);
+
+    float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+    float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
+
+    for (int i = 0; i < n32; i += 32) {
+        x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
+        x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
+        x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
+        x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
+        x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
+        x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
+        x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
+        x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
+
+        y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
+        y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
+        y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
+        y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
+        y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
+        y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
+        y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
+        y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
+
+        sum0 = vfmaq_f32(sum0, x0, y0);
+        sum1 = vfmaq_f32(sum1, x1, y1);
+        sum2 = vfmaq_f32(sum2, x2, y2);
+        sum3 = vfmaq_f32(sum3, x3, y3);
+        sum4 = vfmaq_f32(sum4, x4, y4);
+        sum5 = vfmaq_f32(sum5, x5, y5);
+        sum6 = vfmaq_f32(sum6, x6, y6);
+        sum7 = vfmaq_f32(sum7, x7, y7);
+    }
+
+    // reduce sum0..sum7 to sum0
+    sum0 = vaddq_f32(sum0, sum1);
+    sum2 = vaddq_f32(sum2, sum3);
+    sum4 = vaddq_f32(sum4, sum5);
+    sum6 = vaddq_f32(sum6, sum7);
+    sum0 = vaddq_f32(sum0, sum2);
+    sum4 = vaddq_f32(sum4, sum6);
+    sum0 = vaddq_f32(sum0, sum4);
+
+    // reduce sum0 to sumf
+    float32x2_t sumf32 = vadd_f32(vget_low_f32(sum0), vget_high_f32(sum0));
+    sumf = vget_lane_f32(sumf32, 0) + vget_lane_f32(sumf32, 1);
+#endif

     // leftovers
     for (int i = n32; i < n; ++i) {
@@ -486,6 +548,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
     // NEON 128-bit
     const int n32 = (n & ~31);

+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     const float16x8_t v8 = vdupq_n_f16(v);

     float16x8_t x0, x1, x2, x3;
@@ -512,6 +575,51 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
         vst1q_f16(y + i + 16, y2);
         vst1q_f16(y + i + 24, y3);
     }
+#else
+    const float32x4_t v40 = vdupq_n_f32(v);
+    const float32x4_t v41 = vdupq_n_f32(v);
+
+    float32x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+    float32x4_t y0, y1, y2, y3, y4, y5, y6, y7;
+
+    for (int i = 0; i < n32; i += 32) {
+        y0 = vcvt_f32_f16(vld1_f16(y + i + 0 ));
+        y1 = vcvt_f32_f16(vld1_f16(y + i + 4 ));
+        y2 = vcvt_f32_f16(vld1_f16(y + i + 8 ));
+        y3 = vcvt_f32_f16(vld1_f16(y + i + 12));
+        y4 = vcvt_f32_f16(vld1_f16(y + i + 16));
+        y5 = vcvt_f32_f16(vld1_f16(y + i + 20));
+        y6 = vcvt_f32_f16(vld1_f16(y + i + 24));
+        y7 = vcvt_f32_f16(vld1_f16(y + i + 28));
+
+        x0 = vcvt_f32_f16(vld1_f16(x + i + 0 ));
+        x1 = vcvt_f32_f16(vld1_f16(x + i + 4 ));
+        x2 = vcvt_f32_f16(vld1_f16(x + i + 8 ));
+        x3 = vcvt_f32_f16(vld1_f16(x + i + 12));
+        x4 = vcvt_f32_f16(vld1_f16(x + i + 16));
+        x5 = vcvt_f32_f16(vld1_f16(x + i + 20));
+        x6 = vcvt_f32_f16(vld1_f16(x + i + 24));
+        x7 = vcvt_f32_f16(vld1_f16(x + i + 28));
+
+        y0 = vfmaq_f32(y0, x0, v40);
+        y1 = vfmaq_f32(y1, x1, v40);
+        y2 = vfmaq_f32(y2, x2, v40);
+        y3 = vfmaq_f32(y3, x3, v40);
+        y4 = vfmaq_f32(y4, x4, v41);
+        y5 = vfmaq_f32(y5, x5, v41);
+        y6 = vfmaq_f32(y6, x6, v41);
+        y7 = vfmaq_f32(y7, x7, v41);
+
+        vst1_f16(y + i + 0 , vcvt_f16_f32(y0));
+        vst1_f16(y + i + 4 , vcvt_f16_f32(y1));
+        vst1_f16(y + i + 8 , vcvt_f16_f32(y2));
+        vst1_f16(y + i + 12, vcvt_f16_f32(y3));
+        vst1_f16(y + i + 16, vcvt_f16_f32(y4));
+        vst1_f16(y + i + 20, vcvt_f16_f32(y5));
+        vst1_f16(y + i + 24, vcvt_f16_f32(y6));
+        vst1_f16(y + i + 28, vcvt_f16_f32(y7));
+    }
+#endif

     // leftovers
     for (int i = n32; i < n; ++i) {
@@ -911,16 +1019,18 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     if (is_first_call) {
         const uint64_t t_start = ggml_time_us(); UNUSED(t_start);

+        ggml_fp16_t ii;
         for (int i = 0; i < (1 << 16); ++i) {
-            uint16_t ii = (uint16_t) i;
-            const float f = ggml_fp16_to_fp32(*(ggml_fp16_t *)(&ii));
+            uint16_t ui = i;
+            memcpy(&ii, &ui, sizeof(ii));
+            const float f = ggml_fp16_to_fp32(ii);
             table_gelu_f16[i] = ggml_fp32_to_fp16(ggml_gelu_f32(f));
             table_exp_f16[i] = ggml_fp32_to_fp16(exp(f));
         }

         const uint64_t t_end = ggml_time_us(); UNUSED(t_end);

-        GGML_PRINT_DEBUG("%s: GELU table initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
+        GGML_PRINT_DEBUG("%s: GELU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);

         is_first_call = false;
     }
@@ -4427,13 +4537,15 @@ void ggml_compute_forward_soft_max_f32(

         ggml_float sum = 0.0;

+        uint16_t ss;
         for (int i = 0; i < nc; i++) {
             if (p[i] == -INFINITY) {
                 p[i] = 0.0;
             } else {
                 //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
                 ggml_fp16_t s = ggml_fp32_to_fp16(p[i] - max);
-                const float val = ggml_fp16_to_fp32(table_exp_f16[*(uint16_t *) &s]);
+                memcpy(&ss, &s, sizeof(ss));
+                const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
                 sum += val;
                 p[i] = val;
             }
@@ -5234,13 +5346,15 @@ void ggml_compute_forward_flash_attn_f32(

         ggml_float sum = 0.0;

+        uint16_t ss;
         for (int i = 0; i < M; i++) {
             if (S[i] == -INFINITY) {
                 S[i] = 0.0;
             } else {
                 //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
                 ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
-                const float val = ggml_fp16_to_fp32(table_exp_f16[*(uint16_t *) &s]);
+                memcpy(&ss, &s, sizeof(ss));
+                const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
                 sum += val;
                 S[i] = val;
             }
@@ -5413,13 +5527,15 @@ void ggml_compute_forward_flash_attn_f16(

         ggml_float sum = 0.0;

+        uint16_t ss;
         for (int i = 0; i < M; i++) {
             if (S[i] == -INFINITY) {
                 S[i] = 0.0;
             } else {
                 //const float val = (S[i] == -INFINITY) ? 0.0 : exp(S[i] - max);
                 ggml_fp16_t s = ggml_fp32_to_fp16(S[i] - max);
-                memcpy(&ss, &s, sizeof(ss));
+                const float val = ggml_fp16_to_fp32(table_exp_f16[ss]);
                 sum += val;
                 S[i] = val;
             }
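Besides the new FP32 NEON fallbacks, this file also drops GGML_MEM_ALIGN to 4 on 32-bit targets (where UINTPTR_MAX is 32 bits wide) and repeatedly replaces the type-punning cast *(uint16_t *) &s with a memcpy into a local uint16_t. A minimal standalone sketch of that memcpy idiom, not part of this commit and using float/uint32_t as stand-ins for ggml_fp16_t/uint16_t, assuming the motivation is strict-aliasing safety and the -mno-unaligned-access builds configured above:

// punning.cpp - hypothetical illustration of the memcpy bit-cast idiom.
#include <cstdint>
#include <cstring>
#include <cstdio>

// Reinterpret the bits of a float as an unsigned integer without a
// pointer cast: memcpy has well-defined behavior and modern compilers
// reduce it to a single register move or load.
static uint32_t bits_of(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return u;
}

int main() {
    std::printf("bits of 1.0f = 0x%08x\n", bits_of(1.0f)); // prints 0x3f800000
    return 0;
}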
ggml.h CHANGED
@@ -108,7 +108,7 @@ struct ggml_tensor {
     int64_t perf_time_us;

     void * data;
-    char pad[8];
+    char padding[8];
 };

 // computation graph
whisper.cpp CHANGED
@@ -1291,7 +1291,8 @@ bool whisper_encode(
     struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF);

     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         ggml_build_forward_expand(&gf, inpO);
         ggml_graph_compute (ctxL, &gf);
@@ -1327,7 +1328,8 @@ bool whisper_encode(

     // run the computation
     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         ggml_build_forward_expand(&gf, cur);
         ggml_graph_compute (ctx0, &gf);
@@ -1351,7 +1353,8 @@ bool whisper_encode(

     // pre-compute cross-attention memory
     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         // TODO: hack to disconnect the encoded features from the previous graph
         cur->op = GGML_OP_NONE;
@@ -1461,7 +1464,8 @@ bool whisper_decode(
     };

     struct ggml_context * ctxL = ggml_init(paramsL);
-    struct ggml_cgraph gf = { .n_threads = n_threads };
+    struct ggml_cgraph gf = {};
+    gf.n_threads = n_threads;

     // norm
     {
@@ -1744,7 +1748,8 @@ bool whisper_decode(

     // run the computation
     {
-        struct ggml_cgraph gf = { .n_threads = n_threads };
+        struct ggml_cgraph gf = {};
+        gf.n_threads = n_threads;

         ggml_build_forward_expand(&gf, cur);
         ggml_graph_compute (ctx0, &gf);
@@ -2334,7 +2339,7 @@ int whisper_full(
         }
     }

-    if (seek >= whisper_n_len(ctx)) {
+    if (seek + 100 >= whisper_n_len(ctx)) {
         break;
     }

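The repeated change above drops the C99 designated initializer { .n_threads = n_threads }, presumably because whisper.cpp is compiled as C++, where that syntax is not standard before C++20 and is rejected or warned about by some compilers. The last hunk also stops the whisper_full loop when fewer than 100 frames remain (roughly one second of audio at the 10 ms frame hop), rather than only at the exact end of the input. A minimal sketch of the initializer pattern, using a hypothetical stand-in struct rather than the real ggml_cgraph:

// graph_init.cpp - hypothetical illustration, not part of this commit.
#include <cstdio>

struct cgraph_like {   // stand-in for ggml_cgraph
    int n_threads;
    int n_nodes;
};

int main() {
    // struct cgraph_like gf = { .n_threads = 4 };  // C99 designated initializer:
    //                                              // not portable pre-C++20
    cgraph_like gf = {};   // value-initialize every member to zero
    gf.n_threads = 4;      // then set only the field that matters here
    std::printf("n_threads = %d, n_nodes = %d\n", gf.n_threads, gf.n_nodes);
    return 0;
}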