ggerganov committed on
Commit
8eef240
·
unverified ·
1 Parent(s): d831511

stream : improve real-time transcription

Browse files
Files changed (1) hide show
  1. stream.cpp +27 -6
stream.cpp CHANGED
@@ -37,6 +37,7 @@ struct whisper_params {
37
  int32_t seed = -1; // RNG seed, not used currently
38
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
39
  int32_t step_ms = 3000;
 
40
 
41
  bool verbose = false;
42
  bool translate = false;
@@ -61,6 +62,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
61
  params.n_threads = std::stoi(argv[++i]);
62
  } else if (arg == "--step") {
63
  params.step_ms = std::stoi(argv[++i]);
 
 
64
  } else if (arg == "-v" || arg == "--verbose") {
65
  params.verbose = true;
66
  } else if (arg == "--translate") {
@@ -104,9 +107,10 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
104
  fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
105
  fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
106
  fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
 
107
  fprintf(stderr, " -v, --verbose verbose output\n");
108
  fprintf(stderr, " --translate translate from source language to english\n");
109
- fprintf(stderr, " -nc, --no-context disable context from earlier audio (default: false)\n");
110
  fprintf(stderr, " -ps, --print_special print special tokens\n");
111
  fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
112
  fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
@@ -206,6 +210,7 @@ int main(int argc, char ** argv) {
206
  struct whisper_context * ctx = whisper_init(params.model.c_str());
207
 
208
  const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
 
209
  const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
210
  std::vector<float> pcmf32(n_samples_30s, 0.0f);
211
  std::vector<float> pcmf32_old;
@@ -220,8 +225,12 @@ int main(int argc, char ** argv) {
220
  printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
221
  }
222
  }
223
- printf("%s: processing %d samples (%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
224
- __func__, n_samples, float(n_samples)/WHISPER_SAMPLE_RATE, params.n_threads,
 
 
 
 
225
  params.language.c_str(),
226
  params.translate ? "translate" : "transcribe",
227
  params.no_timestamps ? 0 : 1);
@@ -230,6 +239,7 @@ int main(int argc, char ** argv) {
230
 
231
  SDL_PauseAudioDevice(g_dev_id_in, 0);
232
 
 
233
  bool is_running = true;
234
 
235
  // main audio loop
@@ -253,8 +263,10 @@ int main(int argc, char ** argv) {
253
  const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);
254
 
255
  // take one second from previous iteration
256
- // TODO: better strategy
257
- const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
 
 
258
 
259
  //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
260
 
@@ -288,7 +300,9 @@ int main(int argc, char ** argv) {
288
 
289
  // print result;
290
  {
291
- printf("\n");
 
 
292
 
293
  const int n_segments = whisper_full_n_segments(ctx);
294
  for (int i = 0; i < n_segments; ++i) {
@@ -305,6 +319,13 @@ int main(int argc, char ** argv) {
305
  }
306
  }
307
  }
 
 
 
 
 
 
 
308
  }
309
  }
310
 
 
37
  int32_t seed = -1; // RNG seed, not used currently
38
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
39
  int32_t step_ms = 3000;
40
+ int32_t length_ms = 10000;
41
 
42
  bool verbose = false;
43
  bool translate = false;
 
62
  params.n_threads = std::stoi(argv[++i]);
63
  } else if (arg == "--step") {
64
  params.step_ms = std::stoi(argv[++i]);
65
+ } else if (arg == "--length") {
66
+ params.length_ms = std::stoi(argv[++i]);
67
  } else if (arg == "-v" || arg == "--verbose") {
68
  params.verbose = true;
69
  } else if (arg == "--translate") {
 
107
  fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
108
  fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
109
  fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
110
+ fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
111
  fprintf(stderr, " -v, --verbose verbose output\n");
112
  fprintf(stderr, " --translate translate from source language to english\n");
113
+ fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n");
114
  fprintf(stderr, " -ps, --print_special print special tokens\n");
115
  fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
116
  fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
 
210
  struct whisper_context * ctx = whisper_init(params.model.c_str());
211
 
212
  const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
213
+ const int n_samples_len = (params.length_ms/1000.0)*WHISPER_SAMPLE_RATE;
214
  const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
215
  std::vector<float> pcmf32(n_samples_30s, 0.0f);
216
  std::vector<float> pcmf32_old;
 
225
  printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
226
  }
227
  }
228
+ printf("%s: processing %d samples (step = %.1f sec / len = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
229
+ __func__,
230
+ n_samples,
231
+ float(n_samples)/WHISPER_SAMPLE_RATE,
232
+ float(n_samples_len)/WHISPER_SAMPLE_RATE,
233
+ params.n_threads,
234
  params.language.c_str(),
235
  params.translate ? "translate" : "transcribe",
236
  params.no_timestamps ? 0 : 1);
 
239
 
240
  SDL_PauseAudioDevice(g_dev_id_in, 0);
241
 
242
+ int n_iter = 0;
243
  bool is_running = true;
244
 
245
  // main audio loop
 
263
  const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);
264
 
265
  // take one second from previous iteration
266
+ //const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
267
+
268
+ // take up to params.length_ms audio from previous iteration
269
+ const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_len - n_samples_new));
270
 
271
  //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
272
 
 
300
 
301
  // print result;
302
  {
303
+ if ((n_iter % (params.length_ms / params.step_ms - 1)) != 0) {
304
+ printf("\33[2K\r");
305
+ }
306
 
307
  const int n_segments = whisper_full_n_segments(ctx);
308
  for (int i = 0; i < n_segments; ++i) {
 
319
  }
320
  }
321
  }
322
+
323
+ ++n_iter;
324
+ if ((n_iter % (params.length_ms / params.step_ms - 1)) == 0) {
325
+ printf("\n");
326
+
327
+ pcmf32_old.clear();
328
+ }
329
  }
330
  }
331