Lin Xiaodong (linxiaodong) committed
Commit f795870 · unverified · 1 Parent(s): 4966aed

feat: support vad for addon.node (#3301)

Co-authored-by: linxiaodong <[email protected]>

examples/addon.node/README.md CHANGED

@@ -1,8 +1,10 @@
-# addon
+# whisper.cpp Node.js addon
 
 This is an addon demo that can **perform whisper model inference in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
 It can be used as a reference for using the whisper.cpp project in other node projects.
 
+This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance.
+
 ## Install
 
 ```shell
@@ -26,12 +28,88 @@ For Electron addon and cmake-js options, you can see [cmake-js](https://github.c
 
 ## Run
 
+### Basic Usage
+
 ```shell
 cd examples/addon.node
 
 node index.js --language='language' --model='model-path' --fname_inp='file-path'
 ```
 
-Because this is a simple demo, only the above parameters are set in the node environment.
-
-Other parameters can also be specified in the node environment.
+### VAD (Voice Activity Detection) Usage
+
+Run the VAD example with performance comparison:
+
+```shell
+node vad-example.js
+```
+
+## Voice Activity Detection (VAD) Support
+
+VAD can significantly improve transcription performance by processing only the speech segments, which is especially beneficial for audio files with long periods of silence.
+
+### VAD Model Setup
+
+Before using VAD, download a VAD model:
+
+```shell
+# From the whisper.cpp root directory
+./models/download-vad-model.sh silero-v5.1.2
+```
+
+### VAD Parameters
+
+All VAD parameters are optional and have sensible defaults:
+
+- `vad`: Enable VAD (default: false)
+- `vad_model`: Path to the VAD model file (required when VAD is enabled)
+- `vad_threshold`: Speech detection threshold, 0.0-1.0 (default: 0.5)
+- `vad_min_speech_duration_ms`: Minimum speech duration in ms (default: 250)
+- `vad_min_silence_duration_ms`: Minimum silence duration in ms (default: 100)
+- `vad_max_speech_duration_s`: Maximum speech duration in seconds (default: FLT_MAX)
+- `vad_speech_pad_ms`: Speech padding in ms (default: 30)
+- `vad_samples_overlap`: Sample overlap, 0.0-1.0 (default: 0.1)
+
+### JavaScript API Example
+
+```javascript
+const path = require("path");
+const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// With VAD enabled
+const vadParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  vad: true,
+  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"),
+  vad_threshold: 0.5,
+  progress_callback: (progress) => console.log(`Progress: ${progress}%`)
+};
+
+whisperAsync(vadParams).then(result => console.log(result));
+```
+
+## Supported Parameters
+
+Both traditional whisper.cpp parameters and the new VAD parameters are supported (a combined usage sketch follows this list):
+
+- `language`: Language code (e.g., "en", "es", "fr")
+- `model`: Path to the whisper model file
+- `fname_inp`: Path to the input audio file
+- `use_gpu`: Enable GPU acceleration (default: true)
+- `flash_attn`: Enable flash attention (default: false)
+- `no_prints`: Disable console output (default: false)
+- `no_timestamps`: Disable timestamps (default: false)
+- `detect_language`: Auto-detect the language (default: false)
+- `audio_ctx`: Audio context size (default: 0)
+- `max_len`: Maximum segment length (default: 0)
+- `max_context`: Maximum context size (default: -1)
+- `prompt`: Initial prompt for the decoder
+- `comma_in_time`: Use comma in timestamps (default: true)
+- `print_progress`: Print progress info (default: false)
+- `progress_callback`: Progress callback function
+- VAD parameters (see the section above)
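A minimal combined sketch (an illustration, not part of this diff): it uses the parameters listed above and enables VAD only when the Silero model file is actually present, mirroring the `fs.existsSync` guard used by `vad-example.js`:

```javascript
const fs = require("fs");
const path = require("path");
const { promisify } = require("util");
const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));

const whisperAsync = promisify(whisper);

// Enable VAD only if the Silero model has been downloaded
const vadModel = path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin");
const hasVadModel = fs.existsSync(vadModel);

const params = {
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
  use_gpu: true,
  vad: hasVadModel,
  vad_model: hasVadModel ? vadModel : "",
  vad_threshold: 0.5
};

// result.transcription is an array of [start, end, text] segments
whisperAsync(params).then((result) => console.log(result.transcription));
```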
examples/addon.node/__test__/whisper.spec.js CHANGED

@@ -1,39 +1,133 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../../build/Release/addon.node"
-));
-const { promisify } = require("util");
+const { join } = require('path');
+const { whisper } = require('../../../build/Release/addon.node');
+const { promisify } = require('util');
 
 const whisperAsync = promisify(whisper);
 
-const whisperParamsMock = {
-  language: "en",
-  model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
-  fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
+const commonParams = {
+  language: 'en',
+  model: join(__dirname, '../../../models/ggml-base.en.bin'),
+  fname_inp: join(__dirname, '../../../samples/jfk.wav'),
   use_gpu: true,
   flash_attn: false,
   no_prints: true,
-  comma_in_time: false,
-  translate: true,
   no_timestamps: false,
   detect_language: false,
   audio_ctx: 0,
-  max_len: 0,
-  prompt: "",
-  print_progress: false,
-  progress_callback: (progress) => {
-    console.log(`Progress: ${progress}`);
-  },
-  max_context: -1
+  max_len: 0
 };
 
-describe("Run whisper.node", () => {
-  test("it should receive a non-empty value", async () => {
-    let result = await whisperAsync(whisperParamsMock);
-    console.log(result);
-
-    expect(result['transcription'].length).toBeGreaterThan(0);
-  }, 10000);
+describe('Whisper.cpp Node.js addon with VAD support', () => {
+  test('Basic whisper transcription without VAD', async () => {
+    const params = {
+      ...commonParams,
+      vad: false
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    expect(result.transcription.length).toBeGreaterThan(0);
+
+    // Check that we got some transcription text
+    const text = result.transcription.map(segment => segment[2]).join(' ');
+    expect(text.length).toBeGreaterThan(0);
+    expect(text.toLowerCase()).toContain('ask not');
+  }, 30000);
+
+  test('VAD parameters validation', async () => {
+    // Test with an invalid VAD model - it should return an empty transcription
+    const invalidParams = {
+      ...commonParams,
+      vad: true,
+      vad_model: 'non-existent-model.bin',
+      vad_threshold: 0.5
+    };
+
+    // This should handle the error gracefully and return an empty transcription
+    const result = await whisperAsync(invalidParams);
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    // When the VAD model doesn't exist, it should return an empty transcription
+    expect(result.transcription.length).toBe(0);
+  }, 10000);
+
+  test('VAD parameter parsing', async () => {
+    // Test that VAD parameters are properly parsed (even if no VAD model exists)
+    const vadParams = {
+      ...commonParams,
+      vad: false, // Disabled so no model is required
+      vad_threshold: 0.7,
+      vad_min_speech_duration_ms: 300,
+      vad_min_silence_duration_ms: 150,
+      vad_max_speech_duration_s: 45.0,
+      vad_speech_pad_ms: 50,
+      vad_samples_overlap: 0.15
+    };
+
+    const result = await whisperAsync(vadParams);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+  }, 30000);
+
+  test('Progress callback with VAD disabled', async () => {
+    let progressCalled = false;
+    let lastProgress = 0;
+
+    const params = {
+      ...commonParams,
+      vad: false,
+      progress_callback: (progress) => {
+        progressCalled = true;
+        lastProgress = progress;
+        expect(progress).toBeGreaterThanOrEqual(0);
+        expect(progress).toBeLessThanOrEqual(100);
+      }
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(progressCalled).toBe(true);
+    expect(lastProgress).toBe(100);
+    expect(typeof result).toBe('object');
+  }, 30000);
+
+  test('Language detection without VAD', async () => {
+    const params = {
+      ...commonParams,
+      vad: false,
+      detect_language: true,
+      language: 'auto'
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(typeof result.language).toBe('string');
+    expect(result.language.length).toBeGreaterThan(0);
+  }, 30000);
+
+  test('Basic transcription with all VAD parameters set', async () => {
+    // Test with VAD disabled but all parameters set to ensure no crashes
+    const params = {
+      ...commonParams,
+      vad: false, // Disabled so it works without a VAD model
+      vad_model: '', // Empty model path
+      vad_threshold: 0.6,
+      vad_min_speech_duration_ms: 200,
+      vad_min_silence_duration_ms: 80,
+      vad_max_speech_duration_s: 25.0,
+      vad_speech_pad_ms: 40,
+      vad_samples_overlap: 0.08
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    expect(result.transcription.length).toBeGreaterThan(0);
+  }, 30000);
 });
 
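For a quick manual check outside any test runner, the first assertion above can also be exercised with plain Node (a minimal sketch, assuming the addon has been built and the `ggml-base.en.bin` model and `samples/jfk.wav` file are in place):

```javascript
// Standalone smoke test mirroring the first spec above, without a test framework
const assert = require("assert");
const { join } = require("path");
const { promisify } = require("util");
const { whisper } = require("../../../build/Release/addon.node");

const whisperAsync = promisify(whisper);

(async () => {
  const result = await whisperAsync({
    language: "en",
    model: join(__dirname, "../../../models/ggml-base.en.bin"),
    fname_inp: join(__dirname, "../../../samples/jfk.wav"),
    vad: false
  });
  assert(Array.isArray(result.transcription));
  assert(result.transcription.length > 0);
  console.log("smoke test passed");
})();
```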
examples/addon.node/addon.cpp CHANGED

@@ -9,6 +9,7 @@
 #include <vector>
 #include <cmath>
 #include <cstdint>
+#include <cfloat>
 
 struct whisper_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -51,6 +52,16 @@ struct whisper_params {
     std::vector<std::string> fname_out = {};
 
     std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
+
+    // Voice Activity Detection (VAD) parameters
+    bool vad = false;
+    std::string vad_model = "";
+    float vad_threshold = 0.5f;
+    int vad_min_speech_duration_ms = 250;
+    int vad_min_silence_duration_ms = 100;
+    float vad_max_speech_duration_s = FLT_MAX;
+    int vad_speech_pad_ms = 30;
+    float vad_samples_overlap = 0.1f;
 };
 
 struct whisper_print_user_data {
@@ -333,16 +344,16 @@ class ProgressWorker : public Napi::AsyncWorker {
         };
         wparams.progress_callback_user_data = this;
 
-        // Abort mechanism example
-        {
-            static bool is_aborted = false; // Note: this should be atomic to avoid data races
-
-            wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                bool is_aborted = *(bool*)user_data;
-                return !is_aborted;
-            };
-            wparams.encoder_begin_callback_user_data = &is_aborted;
-        }
+        // Set VAD parameters
+        wparams.vad = params.vad;
+        wparams.vad_model_path = params.vad_model.c_str();
+
+        wparams.vad_params.threshold = params.vad_threshold;
+        wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
+        wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+        wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
+        wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
+        wparams.vad_params.samples_overlap = params.vad_samples_overlap;
 
         if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
             fprintf(stderr, "failed to process audio\n");
@@ -385,14 +396,46 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     std::string language = whisper_params.Get("language").As<Napi::String>();
     std::string model = whisper_params.Get("model").As<Napi::String>();
     std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
-    bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
-    bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
-    bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
-    bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
-    bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
-    int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
-    bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
-    int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
+
+    bool use_gpu = true;
+    if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) {
+        use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
+    }
+
+    bool flash_attn = false;
+    if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) {
+        flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
+    }
+
+    bool no_prints = false;
+    if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) {
+        no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
+    }
+
+    bool no_timestamps = false;
+    if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) {
+        no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
+    }
+
+    bool detect_language = false;
+    if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) {
+        detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
+    }
+
+    int32_t audio_ctx = 0;
+    if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) {
+        audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
+    }
+
+    bool comma_in_time = true;
+    if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) {
+        comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
+    }
+
+    int32_t max_len = 0;
+    if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
+        max_len = whisper_params.Get("max_len").As<Napi::Number>();
+    }
 
     // Add support for max_context
     int32_t max_context = -1;
@@ -408,7 +451,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
 
     // Add support for print_progress
     bool print_progress = false;
-    if (whisper_params.Has("print_progress")) {
+    if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) {
        print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
    }
    // Add support for progress_callback
@@ -417,6 +460,47 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
        progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
    }
 
+    // Add support for VAD parameters
+    bool vad = false;
+    if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) {
+        vad = whisper_params.Get("vad").As<Napi::Boolean>();
+    }
+
+    std::string vad_model = "";
+    if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) {
+        vad_model = whisper_params.Get("vad_model").As<Napi::String>();
+    }
+
+    float vad_threshold = 0.5f;
+    if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) {
+        vad_threshold = whisper_params.Get("vad_threshold").As<Napi::Number>();
+    }
+
+    int vad_min_speech_duration_ms = 250;
+    if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) {
+        vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As<Napi::Number>();
+    }
+
+    int vad_min_silence_duration_ms = 100;
+    if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) {
+        vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As<Napi::Number>();
+    }
+
+    float vad_max_speech_duration_s = FLT_MAX;
+    if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) {
+        vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As<Napi::Number>();
+    }
+
+    int vad_speech_pad_ms = 30;
+    if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) {
+        vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As<Napi::Number>();
+    }
+
+    float vad_samples_overlap = 0.1f;
+    if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) {
+        vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As<Napi::Number>();
+    }
+
     Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
     std::vector<float> pcmf32_vec;
     if (pcmf32Value.IsTypedArray()) {
@@ -444,6 +528,16 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     params.prompt = prompt;
     params.detect_language = detect_language;
 
+    // Set VAD parameters
+    params.vad = vad;
+    params.vad_model = vad_model;
+    params.vad_threshold = vad_threshold;
+    params.vad_min_speech_duration_ms = vad_min_speech_duration_ms;
+    params.vad_min_silence_duration_ms = vad_min_silence_duration_ms;
+    params.vad_max_speech_duration_s = vad_max_speech_duration_s;
+    params.vad_speech_pad_ms = vad_speech_pad_ms;
+    params.vad_samples_overlap = vad_samples_overlap;
+
     Napi::Function callback = info[1].As<Napi::Function>();
     // Create a new Worker class with progress callback support
     ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
examples/addon.node/vad-example.js ADDED

@@ -0,0 +1,132 @@
+const path = require("path");
+const { whisper } = require(path.join(
+  __dirname,
+  "../../build/Release/addon.node"
+));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// Example with VAD enabled
+const vadParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  use_gpu: true,
+  flash_attn: false,
+  no_prints: false,
+  comma_in_time: true,
+  translate: false,
+  no_timestamps: false,
+  detect_language: false,
+  audio_ctx: 0,
+  max_len: 0,
+  // VAD parameters
+  vad: true,
+  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model
+  vad_threshold: 0.5,
+  vad_min_speech_duration_ms: 250,
+  vad_min_silence_duration_ms: 100,
+  vad_max_speech_duration_s: 30.0,
+  vad_speech_pad_ms: 30,
+  vad_samples_overlap: 0.1,
+  progress_callback: (progress) => {
+    console.log(`VAD Transcription progress: ${progress}%`);
+  }
+};
+
+// Example without VAD (traditional approach)
+const traditionalParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  use_gpu: true,
+  flash_attn: false,
+  no_prints: false,
+  comma_in_time: true,
+  translate: false,
+  no_timestamps: false,
+  detect_language: false,
+  audio_ctx: 0,
+  max_len: 0,
+  vad: false, // Explicitly disable VAD
+  progress_callback: (progress) => {
+    console.log(`Traditional transcription progress: ${progress}%`);
+  }
+};
+
+async function runVADExample() {
+  try {
+    console.log("=== Whisper.cpp Node.js VAD Example ===\n");
+
+    // Check if the VAD model exists
+    const fs = require('fs');
+    if (!fs.existsSync(vadParams.vad_model)) {
+      console.log("⚠️  VAD model not found. Please download the VAD model first:");
+      console.log("   ./models/download-vad-model.sh silero-v5.1.2");
+      console.log("   Or run: python models/convert-silero-vad-to-ggml.py");
+      console.log("\n   Falling back to traditional transcription without VAD...\n");
+
+      // Run without VAD
+      console.log("🎵 Running traditional transcription...");
+      const traditionalResult = await whisperAsync(traditionalParams);
+      console.log("\n📝 Traditional transcription result:");
+      console.log(traditionalResult);
+      return;
+    }
+
+    console.log("🎵 Running transcription with VAD enabled...");
+    console.log("VAD Parameters:");
+    console.log(`  - Threshold: ${vadParams.vad_threshold}`);
+    console.log(`  - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`);
+    console.log(`  - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`);
+    console.log(`  - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`);
+    console.log(`  - Speech padding: ${vadParams.vad_speech_pad_ms}ms`);
+    console.log(`  - Samples overlap: ${vadParams.vad_samples_overlap}\n`);
+
+    const startTime = Date.now();
+    const vadResult = await whisperAsync(vadParams);
+    const vadDuration = Date.now() - startTime;
+
+    console.log("\n✅ VAD transcription completed!");
+    console.log(`⏱️  Processing time: ${vadDuration}ms`);
+    console.log("\n📝 VAD transcription result:");
+    console.log(vadResult);
+
+    // Compare with the traditional approach
+    console.log("\n🔄 Running traditional transcription for comparison...");
+    const traditionalStartTime = Date.now();
+    const traditionalResult = await whisperAsync(traditionalParams);
+    const traditionalDuration = Date.now() - traditionalStartTime;
+
+    console.log("\n✅ Traditional transcription completed!");
+    console.log(`⏱️  Processing time: ${traditionalDuration}ms`);
+    console.log("\n📝 Traditional transcription result:");
+    console.log(traditionalResult);
+
+    // Performance comparison
+    console.log("\n📊 Performance Comparison:");
+    console.log(`VAD: ${vadDuration}ms`);
+    console.log(`Traditional: ${traditionalDuration}ms`);
+    const speedup = traditionalDuration / vadDuration;
+    if (speedup > 1) {
+      console.log(`🚀 VAD is ${speedup.toFixed(2)}x faster!`);
+    } else {
+      console.log(`ℹ️  Traditional approach was ${(1 / speedup).toFixed(2)}x faster in this case.`);
+    }
+
+  } catch (error) {
+    console.error("❌ Error during transcription:", error);
+  }
+}
+
+// Run the example
+if (require.main === module) {
+  runVADExample();
+}
+
+module.exports = {
+  runVADExample,
+  vadParams,
+  traditionalParams
+};
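Since `vad-example.js` exports `runVADExample` along with both parameter presets, the example can also be reused programmatically. A minimal sketch from another script in the same directory (an illustration, not part of the diff):

```javascript
// Reuse the exported presets and runner from vad-example.js.
// Requiring the file does not auto-run it: the require.main check
// above only triggers when it is executed directly.
const { runVADExample, vadParams, traditionalParams } = require("./vad-example");

console.log("VAD threshold preset:", vadParams.vad_threshold);          // 0.5
console.log("VAD enabled in fallback preset:", traditionalParams.vad);  // false

// Kick off the full VAD vs. traditional comparison run
runVADExample();
```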