ggerganov committed on
Commit
8f647a2
·
unverified ·
1 Parent(s): e02ade6

whisper : pad audio instead of spectrogram (#579)

Browse files

Also, fall back only if more temperatures are available and if we are
at least 3 seconds before the end of the audio.

Files changed (1) hide show
  1. whisper.cpp +37 -9
whisper.cpp CHANGED
@@ -297,6 +297,7 @@ static const std::map<e_model, size_t> MEM_REQ_DECODE = {
297
 
298
  struct whisper_mel {
299
  int n_len;
 
300
  int n_mel;
301
 
302
  std::vector<float> data;
@@ -2388,8 +2389,28 @@ static bool log_mel_spectrogram(
2388
  hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
2389
  }
2390
 
2391
- mel.n_mel = n_mel;
2392
- mel.n_len = (n_samples)/fft_step;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2393
  mel.data.resize(mel.n_mel*mel.n_len);
2394
 
2395
  //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
@@ -2433,6 +2454,8 @@ static bool log_mel_spectrogram(
2433
 
2434
  wstate.t_mel_us += ggml_time_us() - t_start_us;
2435
 
 
 
2436
  return true;
2437
  }
2438
 
@@ -2786,8 +2809,9 @@ int whisper_set_mel_with_state(
2786
  return -1;
2787
  }
2788
 
2789
- state->mel.n_len = n_len;
2790
- state->mel.n_mel = n_mel;
 
2791
 
2792
  state->mel.data.resize(n_len*n_mel);
2793
  memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
@@ -2913,8 +2937,8 @@ int whisper_lang_auto_detect_with_state(
2913
  return -1;
2914
  }
2915
 
2916
- if (seek >= state->mel.n_len) {
2917
- fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len*10);
2918
  return -2;
2919
  }
2920
 
@@ -3049,11 +3073,11 @@ const char *whisper_model_type_readable(struct whisper_context * ctx) {
3049
  }
3050
 
3051
  int whisper_n_len_from_state(struct whisper_state * state) {
3052
- return state->mel.n_len;
3053
  }
3054
 
3055
  int whisper_n_len(struct whisper_context * ctx) {
3056
- return ctx->state->mel.n_len;
3057
  }
3058
 
3059
  int whisper_n_vocab(struct whisper_context * ctx) {
@@ -4354,7 +4378,11 @@ int whisper_full_with_state(
4354
  }
4355
 
4356
  // was the decoding successful for the current temperature?
4357
- {
 
 
 
 
4358
  bool success = true;
4359
 
4360
  const auto & decoder = state->decoders[best_decoder_id];
 
297
 
298
  struct whisper_mel {
299
  int n_len;
300
+ int n_len_org;
301
  int n_mel;
302
 
303
  std::vector<float> data;
 
2389
  hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
2390
  }
2391
 
2392
+ mel.n_mel = n_mel;
2393
+ mel.n_len = n_samples/fft_step;
2394
+ mel.n_len_org = mel.n_len;
2395
+
2396
+ std::vector<float> samples_padded;
2397
+
2398
+ // pad audio with at least one extra chunk of zeros
2399
+ {
2400
+ const int pad = (100*WHISPER_CHUNK_SIZE)/2;
2401
+
2402
+ if (mel.n_len % pad != 0) {
2403
+ mel.n_len = (mel.n_len/pad + 1)*pad;
2404
+ }
2405
+ mel.n_len += pad;
2406
+
2407
+ samples_padded.resize(mel.n_len*fft_step);
2408
+ memcpy(samples_padded.data(), samples, n_samples*sizeof(float));
2409
+ memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float));
2410
+
2411
+ samples = samples_padded.data();
2412
+ }
2413
+
2414
  mel.data.resize(mel.n_mel*mel.n_len);
2415
 
2416
  //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
 
2454
 
2455
  wstate.t_mel_us += ggml_time_us() - t_start_us;
2456
 
2457
+ //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
2458
+
2459
  return true;
2460
  }
2461
 
 
2809
  return -1;
2810
  }
2811
 
2812
+ state->mel.n_len = n_len;
2813
+ state->mel.n_len_org = n_len;
2814
+ state->mel.n_mel = n_mel;
2815
 
2816
  state->mel.data.resize(n_len*n_mel);
2817
  memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
 
2937
  return -1;
2938
  }
2939
 
2940
+ if (seek >= state->mel.n_len_org) {
2941
+ fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
2942
  return -2;
2943
  }
2944
 
 
3073
  }
3074
 
3075
  int whisper_n_len_from_state(struct whisper_state * state) {
3076
+ return state->mel.n_len_org;
3077
  }
3078
 
3079
  int whisper_n_len(struct whisper_context * ctx) {
3080
+ return ctx->state->mel.n_len_org;
3081
  }
3082
 
3083
  int whisper_n_vocab(struct whisper_context * ctx) {
 
4378
  }
4379
 
4380
  // was the decoding successful for the current temperature?
4381
+ // do fallback only if:
4382
+ // - we are not at the last temperature
4383
+ // - we are not at the end of the audio (3 sec)
4384
+ if (it != (int) temperatures.size() - 1 &&
4385
+ seek_end - seek > 10*WHISPER_CHUNK_SIZE) {
4386
  bool success = true;
4387
 
4388
  const auto & decoder = state->decoders[best_decoder_id];