ggerganov committed
Commit bec875e · 1 Parent(s): e86d64e

whisper : add option to speed up the audio tempo by x2

A Phase Vocoder is used to speed up the audio tempo by scaling down the frequencies in the frequency domain.

This reduces the computation in the Encoder by a factor of 2. The transcription accuracy is degraded, but for slow to normal speech it still seems to be very good.

I think this can find application in real-time transcription - i.e. the "stream" example.

examples/main/main.cpp CHANGED
@@ -59,6 +59,7 @@ struct whisper_params {
 
     float word_thold = 0.01f;
 
+    bool speed_up = false;
     bool verbose = false;
     bool translate = false;
     bool output_txt = false;
@@ -104,6 +105,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
             params.max_len = std::stoi(argv[++i]);
         } else if (arg == "-wt" || arg == "--word-thold") {
             params.word_thold = std::stof(argv[++i]);
+        } else if (arg == "-su" || arg == "--speed-up") {
+            params.speed_up = true;
         } else if (arg == "-v" || arg == "--verbose") {
             params.verbose = true;
         } else if (arg == "--translate") {
@@ -161,6 +164,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
     fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
     fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
     fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
+    fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
     fprintf(stderr, " -v, --verbose verbose output\n");
     fprintf(stderr, " --translate translate from source language to english\n");
     fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
@@ -454,7 +458,7 @@ int main(int argc, char ** argv) {
     std::vector<float> pcmf32;
     {
         drwav wav;
-
+
         if (fname_inp == "-") {
             std::vector<uint8_t> wav_data;
             {
@@ -563,6 +567,8 @@ int main(int argc, char ** argv) {
         wparams.thold_pt = params.word_thold;
         wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
 
+        wparams.speed_up = params.speed_up;
+
         // this callback is called on each new segment
         if (!wparams.print_realtime) {
             wparams.new_segment_callback = whisper_print_segment_callback;
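
With the flag wired up, the main example can opt into the faster mode from the command line, for example (model and sample paths below are only illustrative):

    ./main -m models/ggml-base.en.bin -f samples/jfk.wav -su
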
examples/stream/stream.cpp CHANGED
@@ -41,6 +41,7 @@ struct whisper_params {
    int32_t length_ms = 10000;
    int32_t capture_id = -1;
 
+   bool speed_up = false;
    bool verbose = false;
    bool translate = false;
    bool no_context = true;
@@ -68,6 +69,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
            params.length_ms = std::stoi(argv[++i]);
        } else if (arg == "-c" || arg == "--capture") {
            params.capture_id = std::stoi(argv[++i]);
+       } else if (arg == "-su" || arg == "--speed-up") {
+           params.speed_up = true;
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else if (arg == "--translate") {
@@ -113,6 +116,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
    fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
    fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
    fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
+   fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
    fprintf(stderr, " -v, --verbose verbose output\n");
    fprintf(stderr, " --translate translate from source language to english\n");
    fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n");
@@ -326,6 +330,8 @@ int main(int argc, char ** argv) {
        wparams.language = params.language.c_str();
        wparams.n_threads = params.n_threads;
 
+       wparams.speed_up = params.speed_up;
+
        if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
            fprintf(stderr, "%s: failed to process audio\n", argv[0]);
            return 6;
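
The same flag is exposed in the streaming example, so real-time transcription can trade accuracy for lower latency, e.g. (illustrative invocation):

    ./stream -m models/ggml-base.en.bin -su
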
whisper.cpp CHANGED
@@ -2031,6 +2031,7 @@ static bool log_mel_spectrogram(
    const int n_mel,
    const int n_threads,
    const whisper_filters & filters,
+   const bool speed_up,
    whisper_mel & mel) {
 
    // Hanning window
@@ -2044,7 +2045,7 @@ static bool log_mel_spectrogram(
    mel.n_len = (n_samples)/fft_step;
    mel.data.resize(mel.n_mel*mel.n_len);
 
-   const int n_fft = 1 + fft_size/2;
+   const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2);
 
    //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
    //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
@@ -2091,6 +2092,13 @@ static bool log_mel_spectrogram(
            //}
        }
 
+       if (speed_up) {
+           // scale down in the frequency domain results in a speed up in the time domain
+           for (int j = 0; j < n_fft; j++) {
+               fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]);
+           }
+       }
+
        // mel spectrogram
        for (int j = 0; j < mel.n_mel; j++) {
            double sum = 0.0;
@@ -2171,7 +2179,21 @@ void whisper_free(struct whisper_context * ctx) {
 int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
    const int64_t t_start_us = ggml_time_us();
 
-   if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, ctx->mel)) {
+   if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, ctx->mel)) {
+       fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+       return -1;
+   }
+
+   ctx->t_mel_us = ggml_time_us() - t_start_us;
+
+   return 0;
+}
+
+// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
+int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
+   const int64_t t_start_us = ggml_time_us();
+
+   if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, 2*WHISPER_N_FFT, 2*WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, ctx->mel)) {
        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
        return -1;
    }
@@ -2353,6 +2375,8 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
            /*.thold_ptsum =*/ 0.01f,
            /*.max_len =*/ 0,
 
+           /*.speed_up =*/ false,
+
            /*.language =*/ "en",
 
            /*.greedy =*/ {
@@ -2391,6 +2415,8 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
            /*.thold_ptsum =*/ 0.01f,
            /*.max_len =*/ 0,
 
+           /*.speed_up =*/ false,
+
            /*.language =*/ "en",
 
            /*.greedy =*/ {
@@ -2485,9 +2511,16 @@ int whisper_full(
    result_all.clear();
 
    // compute log mel spectrogram
-   if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
-       fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
-       return -1;
+   if (params.speed_up) {
+       if (whisper_pcm_to_mel_phase_vocoder(ctx, samples, n_samples, params.n_threads) != 0) {
+           fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
+           return -1;
+       }
+   } else {
+       if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
+           fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
+           return -1;
+       }
    }
 
    if (params.token_timestamps) {
@@ -2673,16 +2706,19 @@ int whisper_full(
            if (tokens_cur[i].id > whisper_token_beg(ctx)) {
                const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
                if (!text.empty()) {
+                   const auto tt0 = params.speed_up ? 2*t0 : t0;
+                   const auto tt1 = params.speed_up ? 2*t1 : t1;
+
                    if (params.print_realtime) {
                        if (params.print_timestamps) {
-                           printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
+                           printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
                        } else {
                            printf("%s", text.c_str());
                            fflush(stdout);
                        }
                    }
 
-                   result_all.push_back({ t0, t1, text, {} });
+                   result_all.push_back({ tt0, tt1, text, {} });
                    for (int j = i0; j <= i; j++) {
                        result_all.back().tokens.push_back(tokens_cur[j]);
                    }
@@ -2714,16 +2750,19 @@ int whisper_full(
            if (!text.empty()) {
                const auto t1 = seek + seek_delta;
 
+               const auto tt0 = params.speed_up ? 2*t0 : t0;
+               const auto tt1 = params.speed_up ? 2*t1 : t1;
+
                if (params.print_realtime) {
                    if (params.print_timestamps) {
-                       printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text.c_str());
+                       printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
                    } else {
                        printf("%s", text.c_str());
                        fflush(stdout);
                    }
                }
 
-               result_all.push_back({ t0, t1, text, {} });
+               result_all.push_back({ tt0, tt1, text, {} });
                for (int j = i0; j < (int) tokens_cur.size(); j++) {
                    result_all.back().tokens.push_back(tokens_cur[j]);
                }
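
A note on the arithmetic behind the change: with the default constants in whisper.h (16000 Hz sample rate, hop length 160) the mel spectrogram has 100 frames per second of audio. The phase-vocoder path calls log_mel_spectrogram with 2*WHISPER_N_FFT and 2*WHISPER_HOP_LENGTH, so the same recording spans half as many mel frames and the Encoder has roughly half as much spectrogram to process, which is where the factor-of-2 reduction from the commit message comes from. Because the spectrogram now represents time compressed by 2x, the decoded timestamps are mapped back to real time above via tt0 = 2*t0 and tt1 = 2*t1.
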
whisper.h CHANGED
@@ -202,6 +202,9 @@ extern "C" {
202
  float thold_ptsum; // timestamp token sum probability threshold (~0.01)
203
  int max_len; // max segment length in characters
204
 
 
 
 
205
  const char * language;
206
 
207
  struct {
 
202
  float thold_ptsum; // timestamp token sum probability threshold (~0.01)
203
  int max_len; // max segment length in characters
204
 
205
+ // [EXPERIMENTAL] speed-up techniques
206
+ bool speed_up; // speed-up the audio by 2x using Phase Vocoder
207
+
208
  const char * language;
209
 
210
  struct {
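
For library users, a minimal sketch of enabling the new mode through the C API (assuming the whisper.h interface as of this commit - whisper_init, whisper_full_default_params, whisper_full, whisper_free; the function name transcribe_fast is made up and loading the 16 kHz mono float PCM into pcmf32 is omitted):

```cpp
#include "whisper.h"

#include <vector>

// Sketch: transcribe pcmf32 with the experimental 2x speed-up enabled.
int transcribe_fast(const char * model_path, const std::vector<float> & pcmf32) {
    struct whisper_context * ctx = whisper_init(model_path);
    if (ctx == nullptr) {
        return 1;
    }

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // trade some transcription accuracy for roughly half the Encoder work
    wparams.speed_up = true;

    const int ret = whisper_full(ctx, wparams, pcmf32.data(), (int) pcmf32.size());

    whisper_free(ctx);

    return ret;
}
```
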