Spaces:
Sleeping
Sleeping
whisper : add option to speed up the audio tempo by x2
Browse files
Use a Phase Vocoder to speed up the audio tempo by scaling down
the frequencies in the frequency domain.
This reduces the computation in the Encoder by a factor of 2.
The transcription accuracy is degraded, but for slow to normal speech
it still seems to be very good.
I think this could be useful for real-time transcription, i.e. the
"stream" example.
- examples/main/main.cpp +7 -1
- examples/stream/stream.cpp +6 -0
- whisper.cpp +48 -9
- whisper.h +3 -0
examples/main/main.cpp
CHANGED
|
@@ -59,6 +59,7 @@ struct whisper_params {
|
|
| 59 |
|
| 60 |
float word_thold = 0.01f;
|
| 61 |
|
|
|
|
| 62 |
bool verbose = false;
|
| 63 |
bool translate = false;
|
| 64 |
bool output_txt = false;
|
|
@@ -104,6 +105,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 104 |
params.max_len = std::stoi(argv[++i]);
|
| 105 |
} else if (arg == "-wt" || arg == "--word-thold") {
|
| 106 |
params.word_thold = std::stof(argv[++i]);
|
|
|
|
|
|
|
| 107 |
} else if (arg == "-v" || arg == "--verbose") {
|
| 108 |
params.verbose = true;
|
| 109 |
} else if (arg == "--translate") {
|
|
@@ -161,6 +164,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 161 |
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
|
| 162 |
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
|
| 163 |
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
|
|
|
|
| 164 |
fprintf(stderr, " -v, --verbose verbose output\n");
|
| 165 |
fprintf(stderr, " --translate translate from source language to english\n");
|
| 166 |
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
|
|
@@ -454,7 +458,7 @@ int main(int argc, char ** argv) {
|
|
| 454 |
std::vector<float> pcmf32;
|
| 455 |
{
|
| 456 |
drwav wav;
|
| 457 |
-
|
| 458 |
if (fname_inp == "-") {
|
| 459 |
std::vector<uint8_t> wav_data;
|
| 460 |
{
|
|
@@ -563,6 +567,8 @@ int main(int argc, char ** argv) {
|
|
| 563 |
wparams.thold_pt = params.word_thold;
|
| 564 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 565 |
|
|
|
|
|
|
|
| 566 |
// this callback is called on each new segment
|
| 567 |
if (!wparams.print_realtime) {
|
| 568 |
wparams.new_segment_callback = whisper_print_segment_callback;
|
|
|
|
| 59 |
|
| 60 |
float word_thold = 0.01f;
|
| 61 |
|
| 62 |
+
bool speed_up = false;
|
| 63 |
bool verbose = false;
|
| 64 |
bool translate = false;
|
| 65 |
bool output_txt = false;
|
|
|
|
| 105 |
params.max_len = std::stoi(argv[++i]);
|
| 106 |
} else if (arg == "-wt" || arg == "--word-thold") {
|
| 107 |
params.word_thold = std::stof(argv[++i]);
|
| 108 |
+
} else if (arg == "-su" || arg == "--speed-up") {
|
| 109 |
+
params.speed_up = true;
|
| 110 |
} else if (arg == "-v" || arg == "--verbose") {
|
| 111 |
params.verbose = true;
|
| 112 |
} else if (arg == "--translate") {
|
|
|
|
| 164 |
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
|
| 165 |
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
|
| 166 |
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
|
| 167 |
+
fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
|
| 168 |
fprintf(stderr, " -v, --verbose verbose output\n");
|
| 169 |
fprintf(stderr, " --translate translate from source language to english\n");
|
| 170 |
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
|
|
|
|
| 458 |
std::vector<float> pcmf32;
|
| 459 |
{
|
| 460 |
drwav wav;
|
| 461 |
+
|
| 462 |
if (fname_inp == "-") {
|
| 463 |
std::vector<uint8_t> wav_data;
|
| 464 |
{
|
|
|
|
| 567 |
wparams.thold_pt = params.word_thold;
|
| 568 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 569 |
|
| 570 |
+
wparams.speed_up = params.speed_up;
|
| 571 |
+
|
| 572 |
// this callback is called on each new segment
|
| 573 |
if (!wparams.print_realtime) {
|
| 574 |
wparams.new_segment_callback = whisper_print_segment_callback;
|
examples/stream/stream.cpp
CHANGED
|
@@ -41,6 +41,7 @@ struct whisper_params {
|
|
| 41 |
int32_t length_ms = 10000;
|
| 42 |
int32_t capture_id = -1;
|
| 43 |
|
|
|
|
| 44 |
bool verbose = false;
|
| 45 |
bool translate = false;
|
| 46 |
bool no_context = true;
|
|
@@ -68,6 +69,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 68 |
params.length_ms = std::stoi(argv[++i]);
|
| 69 |
} else if (arg == "-c" || arg == "--capture") {
|
| 70 |
params.capture_id = std::stoi(argv[++i]);
|
|
|
|
|
|
|
| 71 |
} else if (arg == "-v" || arg == "--verbose") {
|
| 72 |
params.verbose = true;
|
| 73 |
} else if (arg == "--translate") {
|
|
@@ -113,6 +116,7 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 113 |
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
|
| 114 |
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
|
| 115 |
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
|
|
|
|
| 116 |
fprintf(stderr, " -v, --verbose verbose output\n");
|
| 117 |
fprintf(stderr, " --translate translate from source language to english\n");
|
| 118 |
fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n");
|
|
@@ -326,6 +330,8 @@ int main(int argc, char ** argv) {
|
|
| 326 |
wparams.language = params.language.c_str();
|
| 327 |
wparams.n_threads = params.n_threads;
|
| 328 |
|
|
|
|
|
|
|
| 329 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 330 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 331 |
return 6;
|
|
|
|
| 41 |
int32_t length_ms = 10000;
|
| 42 |
int32_t capture_id = -1;
|
| 43 |
|
| 44 |
+
bool speed_up = false;
|
| 45 |
bool verbose = false;
|
| 46 |
bool translate = false;
|
| 47 |
bool no_context = true;
|
|
|
|
| 69 |
params.length_ms = std::stoi(argv[++i]);
|
| 70 |
} else if (arg == "-c" || arg == "--capture") {
|
| 71 |
params.capture_id = std::stoi(argv[++i]);
|
| 72 |
+
} else if (arg == "-su" || arg == "--speed-up") {
|
| 73 |
+
params.speed_up = true;
|
| 74 |
} else if (arg == "-v" || arg == "--verbose") {
|
| 75 |
params.verbose = true;
|
| 76 |
} else if (arg == "--translate") {
|
|
|
|
| 116 |
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
|
| 117 |
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
|
| 118 |
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
|
| 119 |
+
fprintf(stderr, " -su, --speed-up speed up audio by factor of 2 (faster processing, reduced accuracy, default: %s)\n", params.speed_up ? "true" : "false");
|
| 120 |
fprintf(stderr, " -v, --verbose verbose output\n");
|
| 121 |
fprintf(stderr, " --translate translate from source language to english\n");
|
| 122 |
fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n");
|
|
|
|
| 330 |
wparams.language = params.language.c_str();
|
| 331 |
wparams.n_threads = params.n_threads;
|
| 332 |
|
| 333 |
+
wparams.speed_up = params.speed_up;
|
| 334 |
+
|
| 335 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 336 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 337 |
return 6;
|
whisper.cpp
CHANGED
|
@@ -2031,6 +2031,7 @@ static bool log_mel_spectrogram(
|
|
| 2031 |
const int n_mel,
|
| 2032 |
const int n_threads,
|
| 2033 |
const whisper_filters & filters,
|
|
|
|
| 2034 |
whisper_mel & mel) {
|
| 2035 |
|
| 2036 |
// Hanning window
|
|
@@ -2044,7 +2045,7 @@ static bool log_mel_spectrogram(
|
|
| 2044 |
mel.n_len = (n_samples)/fft_step;
|
| 2045 |
mel.data.resize(mel.n_mel*mel.n_len);
|
| 2046 |
|
| 2047 |
-
const int n_fft = 1 + fft_size/2;
|
| 2048 |
|
| 2049 |
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
|
| 2050 |
//printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
|
|
@@ -2091,6 +2092,13 @@ static bool log_mel_spectrogram(
|
|
| 2091 |
//}
|
| 2092 |
}
|
| 2093 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2094 |
// mel spectrogram
|
| 2095 |
for (int j = 0; j < mel.n_mel; j++) {
|
| 2096 |
double sum = 0.0;
|
|
@@ -2171,7 +2179,21 @@ void whisper_free(struct whisper_context * ctx) {
|
|
| 2171 |
int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
|
| 2172 |
const int64_t t_start_us = ggml_time_us();
|
| 2173 |
|
| 2174 |
-
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, ctx->mel)) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2175 |
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
|
| 2176 |
return -1;
|
| 2177 |
}
|
|
@@ -2353,6 +2375,8 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2353 |
/*.thold_ptsum =*/ 0.01f,
|
| 2354 |
/*.max_len =*/ 0,
|
| 2355 |
|
|
|
|
|
|
|
| 2356 |
/*.language =*/ "en",
|
| 2357 |
|
| 2358 |
/*.greedy =*/ {
|
|
@@ -2391,6 +2415,8 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2391 |
/*.thold_ptsum =*/ 0.01f,
|
| 2392 |
/*.max_len =*/ 0,
|
| 2393 |
|
|
|
|
|
|
|
| 2394 |
/*.language =*/ "en",
|
| 2395 |
|
| 2396 |
/*.greedy =*/ {
|
|
@@ -2485,9 +2511,16 @@ int whisper_full(
|
|
| 2485 |
result_all.clear();
|
| 2486 |
|
| 2487 |
// compute log mel spectrogram
|
| 2488 |
-
if (
|
| 2489 |
-
|
| 2490 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2491 |
}
|
| 2492 |
|
| 2493 |
if (params.token_timestamps) {
|
|
@@ -2673,16 +2706,19 @@ int whisper_full(
|
|
| 2673 |
if (tokens_cur[i].id > whisper_token_beg(ctx)) {
|
| 2674 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
| 2675 |
if (!text.empty()) {
|
|
|
|
|
|
|
|
|
|
| 2676 |
if (params.print_realtime) {
|
| 2677 |
if (params.print_timestamps) {
|
| 2678 |
-
printf("[%s --> %s] %s\n", to_timestamp(
|
| 2679 |
} else {
|
| 2680 |
printf("%s", text.c_str());
|
| 2681 |
fflush(stdout);
|
| 2682 |
}
|
| 2683 |
}
|
| 2684 |
|
| 2685 |
-
result_all.push_back({
|
| 2686 |
for (int j = i0; j <= i; j++) {
|
| 2687 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 2688 |
}
|
|
@@ -2714,16 +2750,19 @@ int whisper_full(
|
|
| 2714 |
if (!text.empty()) {
|
| 2715 |
const auto t1 = seek + seek_delta;
|
| 2716 |
|
|
|
|
|
|
|
|
|
|
| 2717 |
if (params.print_realtime) {
|
| 2718 |
if (params.print_timestamps) {
|
| 2719 |
-
printf("[%s --> %s] %s\n", to_timestamp(
|
| 2720 |
} else {
|
| 2721 |
printf("%s", text.c_str());
|
| 2722 |
fflush(stdout);
|
| 2723 |
}
|
| 2724 |
}
|
| 2725 |
|
| 2726 |
-
result_all.push_back({
|
| 2727 |
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
| 2728 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 2729 |
}
|
|
|
|
| 2031 |
const int n_mel,
|
| 2032 |
const int n_threads,
|
| 2033 |
const whisper_filters & filters,
|
| 2034 |
+
const bool speed_up,
|
| 2035 |
whisper_mel & mel) {
|
| 2036 |
|
| 2037 |
// Hanning window
|
|
|
|
| 2045 |
mel.n_len = (n_samples)/fft_step;
|
| 2046 |
mel.data.resize(mel.n_mel*mel.n_len);
|
| 2047 |
|
| 2048 |
+
const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2);
|
| 2049 |
|
| 2050 |
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
|
| 2051 |
//printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
|
|
|
|
| 2092 |
//}
|
| 2093 |
}
|
| 2094 |
|
| 2095 |
+
if (speed_up) {
|
| 2096 |
+
// scale down in the frequency domain results in a speed up in the time domain
|
| 2097 |
+
for (int j = 0; j < n_fft; j++) {
|
| 2098 |
+
fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]);
|
| 2099 |
+
}
|
| 2100 |
+
}
|
| 2101 |
+
|
| 2102 |
// mel spectrogram
|
| 2103 |
for (int j = 0; j < mel.n_mel; j++) {
|
| 2104 |
double sum = 0.0;
|
|
|
|
| 2179 |
int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
|
| 2180 |
const int64_t t_start_us = ggml_time_us();
|
| 2181 |
|
| 2182 |
+
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, ctx->mel)) {
|
| 2183 |
+
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
|
| 2184 |
+
return -1;
|
| 2185 |
+
}
|
| 2186 |
+
|
| 2187 |
+
ctx->t_mel_us = ggml_time_us() - t_start_us;
|
| 2188 |
+
|
| 2189 |
+
return 0;
|
| 2190 |
+
}
|
| 2191 |
+
|
| 2192 |
+
// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
|
| 2193 |
+
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
|
| 2194 |
+
const int64_t t_start_us = ggml_time_us();
|
| 2195 |
+
|
| 2196 |
+
if (!log_mel_spectrogram(samples, n_samples, WHISPER_SAMPLE_RATE, 2*WHISPER_N_FFT, 2*WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, ctx->mel)) {
|
| 2197 |
fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
|
| 2198 |
return -1;
|
| 2199 |
}
|
|
|
|
| 2375 |
/*.thold_ptsum =*/ 0.01f,
|
| 2376 |
/*.max_len =*/ 0,
|
| 2377 |
|
| 2378 |
+
/*.speed_up =*/ false,
|
| 2379 |
+
|
| 2380 |
/*.language =*/ "en",
|
| 2381 |
|
| 2382 |
/*.greedy =*/ {
|
|
|
|
| 2415 |
/*.thold_ptsum =*/ 0.01f,
|
| 2416 |
/*.max_len =*/ 0,
|
| 2417 |
|
| 2418 |
+
/*.speed_up =*/ false,
|
| 2419 |
+
|
| 2420 |
/*.language =*/ "en",
|
| 2421 |
|
| 2422 |
/*.greedy =*/ {
|
|
|
|
| 2511 |
result_all.clear();
|
| 2512 |
|
| 2513 |
// compute log mel spectrogram
|
| 2514 |
+
if (params.speed_up) {
|
| 2515 |
+
if (whisper_pcm_to_mel_phase_vocoder(ctx, samples, n_samples, params.n_threads) != 0) {
|
| 2516 |
+
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
|
| 2517 |
+
return -1;
|
| 2518 |
+
}
|
| 2519 |
+
} else {
|
| 2520 |
+
if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
|
| 2521 |
+
fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
|
| 2522 |
+
return -1;
|
| 2523 |
+
}
|
| 2524 |
}
|
| 2525 |
|
| 2526 |
if (params.token_timestamps) {
|
|
|
|
| 2706 |
if (tokens_cur[i].id > whisper_token_beg(ctx)) {
|
| 2707 |
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
| 2708 |
if (!text.empty()) {
|
| 2709 |
+
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
| 2710 |
+
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
| 2711 |
+
|
| 2712 |
if (params.print_realtime) {
|
| 2713 |
if (params.print_timestamps) {
|
| 2714 |
+
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
|
| 2715 |
} else {
|
| 2716 |
printf("%s", text.c_str());
|
| 2717 |
fflush(stdout);
|
| 2718 |
}
|
| 2719 |
}
|
| 2720 |
|
| 2721 |
+
result_all.push_back({ tt0, tt1, text, {} });
|
| 2722 |
for (int j = i0; j <= i; j++) {
|
| 2723 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 2724 |
}
|
|
|
|
| 2750 |
if (!text.empty()) {
|
| 2751 |
const auto t1 = seek + seek_delta;
|
| 2752 |
|
| 2753 |
+
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
| 2754 |
+
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
| 2755 |
+
|
| 2756 |
if (params.print_realtime) {
|
| 2757 |
if (params.print_timestamps) {
|
| 2758 |
+
printf("[%s --> %s] %s\n", to_timestamp(tt0).c_str(), to_timestamp(tt1).c_str(), text.c_str());
|
| 2759 |
} else {
|
| 2760 |
printf("%s", text.c_str());
|
| 2761 |
fflush(stdout);
|
| 2762 |
}
|
| 2763 |
}
|
| 2764 |
|
| 2765 |
+
result_all.push_back({ tt0, tt1, text, {} });
|
| 2766 |
for (int j = i0; j < (int) tokens_cur.size(); j++) {
|
| 2767 |
result_all.back().tokens.push_back(tokens_cur[j]);
|
| 2768 |
}
|
whisper.h
CHANGED
|
@@ -202,6 +202,9 @@ extern "C" {
|
|
| 202 |
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
|
| 203 |
int max_len; // max segment length in characters
|
| 204 |
|
|
|
|
|
|
|
|
|
|
| 205 |
const char * language;
|
| 206 |
|
| 207 |
struct {
|
|
|
|
| 202 |
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
|
| 203 |
int max_len; // max segment length in characters
|
| 204 |
|
| 205 |
+
// [EXPERIMENTAL] speed-up techniques
|
| 206 |
+
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
| 207 |
+
|
| 208 |
const char * language;
|
| 209 |
|
| 210 |
struct {
|