Spaces:
Running
Running
whisper : pad audio instead of spectrogram (#579)
Browse files
Also, fallback only if more temperatures are available and if we are
at least 3 seconds before the end of the audio
- whisper.cpp +37 -9
whisper.cpp
CHANGED
|
@@ -297,6 +297,7 @@ static const std::map<e_model, size_t> MEM_REQ_DECODE = {
|
|
| 297 |
|
| 298 |
struct whisper_mel {
|
| 299 |
int n_len;
|
|
|
|
| 300 |
int n_mel;
|
| 301 |
|
| 302 |
std::vector<float> data;
|
|
@@ -2388,8 +2389,28 @@ static bool log_mel_spectrogram(
|
|
| 2388 |
hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
|
| 2389 |
}
|
| 2390 |
|
| 2391 |
-
mel.n_mel = n_mel;
|
| 2392 |
-
mel.n_len = (n_samples)/fft_step;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2393 |
mel.data.resize(mel.n_mel*mel.n_len);
|
| 2394 |
|
| 2395 |
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
|
|
@@ -2433,6 +2454,8 @@ static bool log_mel_spectrogram(
|
|
| 2433 |
|
| 2434 |
wstate.t_mel_us += ggml_time_us() - t_start_us;
|
| 2435 |
|
|
|
|
|
|
|
| 2436 |
return true;
|
| 2437 |
}
|
| 2438 |
|
|
@@ -2786,8 +2809,9 @@ int whisper_set_mel_with_state(
|
|
| 2786 |
return -1;
|
| 2787 |
}
|
| 2788 |
|
| 2789 |
-
state->mel.n_len = n_len;
|
| 2790 |
-
state->mel.n_mel = n_mel;
|
|
|
|
| 2791 |
|
| 2792 |
state->mel.data.resize(n_len*n_mel);
|
| 2793 |
memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
|
|
@@ -2913,8 +2937,8 @@ int whisper_lang_auto_detect_with_state(
|
|
| 2913 |
return -1;
|
| 2914 |
}
|
| 2915 |
|
| 2916 |
-
if (seek >= state->mel.n_len) {
|
| 2917 |
-
fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len*10);
|
| 2918 |
return -2;
|
| 2919 |
}
|
| 2920 |
|
|
@@ -3049,11 +3073,11 @@ const char *whisper_model_type_readable(struct whisper_context * ctx) {
|
|
| 3049 |
}
|
| 3050 |
|
| 3051 |
int whisper_n_len_from_state(struct whisper_state * state) {
|
| 3052 |
-
return state->mel.n_len;
|
| 3053 |
}
|
| 3054 |
|
| 3055 |
int whisper_n_len(struct whisper_context * ctx) {
|
| 3056 |
-
return ctx->state->mel.n_len;
|
| 3057 |
}
|
| 3058 |
|
| 3059 |
int whisper_n_vocab(struct whisper_context * ctx) {
|
|
@@ -4354,7 +4378,11 @@ int whisper_full_with_state(
|
|
| 4354 |
}
|
| 4355 |
|
| 4356 |
// was the decoding successful for the current temperature?
|
| 4357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4358 |
bool success = true;
|
| 4359 |
|
| 4360 |
const auto & decoder = state->decoders[best_decoder_id];
|
|
|
|
| 297 |
|
| 298 |
struct whisper_mel {
|
| 299 |
int n_len;
|
| 300 |
+
int n_len_org;
|
| 301 |
int n_mel;
|
| 302 |
|
| 303 |
std::vector<float> data;
|
|
|
|
| 2389 |
hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
|
| 2390 |
}
|
| 2391 |
|
| 2392 |
+
mel.n_mel = n_mel;
|
| 2393 |
+
mel.n_len = n_samples/fft_step;
|
| 2394 |
+
mel.n_len_org = mel.n_len;
|
| 2395 |
+
|
| 2396 |
+
std::vector<float> samples_padded;
|
| 2397 |
+
|
| 2398 |
+
// pad audio with at least one extra chunk of zeros
|
| 2399 |
+
{
|
| 2400 |
+
const int pad = (100*WHISPER_CHUNK_SIZE)/2;
|
| 2401 |
+
|
| 2402 |
+
if (mel.n_len % pad != 0) {
|
| 2403 |
+
mel.n_len = (mel.n_len/pad + 1)*pad;
|
| 2404 |
+
}
|
| 2405 |
+
mel.n_len += pad;
|
| 2406 |
+
|
| 2407 |
+
samples_padded.resize(mel.n_len*fft_step);
|
| 2408 |
+
memcpy(samples_padded.data(), samples, n_samples*sizeof(float));
|
| 2409 |
+
memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float));
|
| 2410 |
+
|
| 2411 |
+
samples = samples_padded.data();
|
| 2412 |
+
}
|
| 2413 |
+
|
| 2414 |
mel.data.resize(mel.n_mel*mel.n_len);
|
| 2415 |
|
| 2416 |
//printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
|
|
|
|
| 2454 |
|
| 2455 |
wstate.t_mel_us += ggml_time_us() - t_start_us;
|
| 2456 |
|
| 2457 |
+
//printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
|
| 2458 |
+
|
| 2459 |
return true;
|
| 2460 |
}
|
| 2461 |
|
|
|
|
| 2809 |
return -1;
|
| 2810 |
}
|
| 2811 |
|
| 2812 |
+
state->mel.n_len = n_len;
|
| 2813 |
+
state->mel.n_len_org = n_len;
|
| 2814 |
+
state->mel.n_mel = n_mel;
|
| 2815 |
|
| 2816 |
state->mel.data.resize(n_len*n_mel);
|
| 2817 |
memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
|
|
|
|
| 2937 |
return -1;
|
| 2938 |
}
|
| 2939 |
|
| 2940 |
+
if (seek >= state->mel.n_len_org) {
|
| 2941 |
+
fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
|
| 2942 |
return -2;
|
| 2943 |
}
|
| 2944 |
|
|
|
|
| 3073 |
}
|
| 3074 |
|
| 3075 |
int whisper_n_len_from_state(struct whisper_state * state) {
|
| 3076 |
+
return state->mel.n_len_org;
|
| 3077 |
}
|
| 3078 |
|
| 3079 |
int whisper_n_len(struct whisper_context * ctx) {
|
| 3080 |
+
return ctx->state->mel.n_len_org;
|
| 3081 |
}
|
| 3082 |
|
| 3083 |
int whisper_n_vocab(struct whisper_context * ctx) {
|
|
|
|
| 4378 |
}
|
| 4379 |
|
| 4380 |
// was the decoding successful for the current temperature?
|
| 4381 |
+
// do fallback only if:
|
| 4382 |
+
// - we are not at the last temperature
|
| 4383 |
+
// - we are not at the end of the audio (3 sec)
|
| 4384 |
+
if (it != (int) temperatures.size() - 1 &&
|
| 4385 |
+
seek_end - seek > 10*WHISPER_CHUNK_SIZE) {
|
| 4386 |
bool success = true;
|
| 4387 |
|
| 4388 |
const auto & decoder = state->decoders[best_decoder_id];
|