Lin Xiaodong (linxiaodong) committed

feat: support vad for addon.node (#3301)

Co-authored-by: linxiaodong <[email protected]>
Files changed:

- examples/addon.node/README.md (+81 -3)
- examples/addon.node/__test__/whisper.spec.js (+119 -25)
- examples/addon.node/addon.cpp (+112 -18)
- examples/addon.node/vad-example.js (+132 -0)
examples/addon.node/README.md (CHANGED)

````diff
@@ -1,8 +1,10 @@
-# addon
+# whisper.cpp Node.js addon
 
 This is an addon demo that can **perform whisper model reasoning in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
 It can be used as a reference for using the whisper.cpp project in other node projects.
 
+This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance.
+
 ## Install
 
 ```shell
@@ -26,12 +28,88 @@ For Electron addon and cmake-js options, you can see [cmake-js](https://github.c
 
 ## Run
 
+### Basic Usage
+
 ```shell
 cd examples/addon.node
 
 node index.js --language='language' --model='model-path' --fname_inp='file-path'
 ```
 
-Because this is a simple Demo, only the above parameters are set in the node environment.
+### VAD (Voice Activity Detection) Usage
+
+Run the VAD example with performance comparison:
+
+```shell
+node vad-example.js
+```
+
+## Voice Activity Detection (VAD) Support
+
+VAD can significantly improve transcription performance by only processing speech segments, which is especially beneficial for audio files with long periods of silence.
+
+### VAD Model Setup
+
+Before using VAD, download a VAD model:
+
+```shell
+# From the whisper.cpp root directory
+./models/download-vad-model.sh silero-v5.1.2
+```
+
+### VAD Parameters
+
+All VAD parameters are optional and have sensible defaults:
+
+- `vad`: Enable VAD (default: false)
+- `vad_model`: Path to VAD model file (required when VAD enabled)
+- `vad_threshold`: Speech detection threshold 0.0-1.0 (default: 0.5)
+- `vad_min_speech_duration_ms`: Min speech duration in ms (default: 250)
+- `vad_min_silence_duration_ms`: Min silence duration in ms (default: 100)
+- `vad_max_speech_duration_s`: Max speech duration in seconds (default: FLT_MAX)
+- `vad_speech_pad_ms`: Speech padding in ms (default: 30)
+- `vad_samples_overlap`: Sample overlap 0.0-1.0 (default: 0.1)
+
+### JavaScript API Example
+
+```javascript
+const path = require("path");
+const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// With VAD enabled
+const vadParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  vad: true,
+  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"),
+  vad_threshold: 0.5,
+  progress_callback: (progress) => console.log(`Progress: ${progress}%`)
+};
+
+whisperAsync(vadParams).then(result => console.log(result));
+```
+
+## Supported Parameters
+
+Both traditional whisper.cpp parameters and new VAD parameters are supported:
 
-Other parameters can also be set in the node environment.
+- `language`: Language code (e.g., "en", "es", "fr")
+- `model`: Path to whisper model file
+- `fname_inp`: Path to input audio file
+- `use_gpu`: Enable GPU acceleration (default: true)
+- `flash_attn`: Enable flash attention (default: false)
+- `no_prints`: Disable console output (default: false)
+- `no_timestamps`: Disable timestamps (default: false)
+- `detect_language`: Auto-detect language (default: false)
+- `audio_ctx`: Audio context size (default: 0)
+- `max_len`: Maximum segment length (default: 0)
+- `max_context`: Maximum context size (default: -1)
+- `prompt`: Initial prompt for decoder
+- `comma_in_time`: Use comma in timestamps (default: true)
+- `print_progress`: Print progress info (default: false)
+- `progress_callback`: Progress callback function
+- VAD parameters (see above section)
````
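The README's new example exercises the VAD path. For contrast, here is a minimal sketch of a plain non-VAD call (the build output and model/sample paths are assumptions matching the README's layout): only `language`, `model`, and `fname_inp` are supplied, so every other option falls back to the defaults listed under Supported Parameters.

```javascript
const path = require("path");
const { promisify } = require("util");
// Assumes the addon has already been built to build/Release, as in the README example.
const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));

const whisperAsync = promisify(whisper);

// Only the three required parameters; vad defaults to false, use_gpu to true, etc.
whisperAsync({
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../samples/jfk.wav")
}).then((result) => console.log(result.transcription));
```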
examples/addon.node/__test__/whisper.spec.js (CHANGED)

```diff
@@ -1,39 +1,133 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../../build/Release/addon.node"
-));
-const { promisify } = require("util");
+const { join } = require('path');
+const { whisper } = require('../../../build/Release/addon.node');
+const { promisify } = require('util');
 
 const whisperAsync = promisify(whisper);
 
-const whisperParamsMock = {
-  language: "en",
-  model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
-  fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
+const commonParams = {
+  language: 'en',
+  model: join(__dirname, '../../../models/ggml-base.en.bin'),
+  fname_inp: join(__dirname, '../../../samples/jfk.wav'),
   use_gpu: true,
   flash_attn: false,
   no_prints: true,
-  comma_in_time: false,
-  translate: true,
   no_timestamps: false,
   detect_language: false,
   audio_ctx: 0,
-  max_len: 0,
-  prompt: "",
-  print_progress: false,
-  progress_callback: (progress) => {
-    console.log(`Progress: ${progress}`);
-  },
-  max_context: -1
+  max_len: 0
 };
 
-describe("Run whisper.node", () => {
-  test("it should receive a non-empty value", async () => {
-    let result = await whisperAsync(whisperParamsMock);
-
-    expect(result.transcription.length).toBeGreaterThan(0);
-
-  }, 10000);
+describe('Whisper.cpp Node.js addon with VAD support', () => {
+  test('Basic whisper transcription without VAD', async () => {
+    const params = {
+      ...commonParams,
+      vad: false
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    expect(result.transcription.length).toBeGreaterThan(0);
+
+    // Check that we got some transcription text
+    const text = result.transcription.map(segment => segment[2]).join(' ');
+    expect(text.length).toBeGreaterThan(0);
+    expect(text.toLowerCase()).toContain('ask not');
+  }, 30000);
+
+  test('VAD parameters validation', async () => {
+    // Test with invalid VAD model - should return empty transcription
+    const invalidParams = {
+      ...commonParams,
+      vad: true,
+      vad_model: 'non-existent-model.bin',
+      vad_threshold: 0.5
+    };
+
+    // This should handle the error gracefully and return empty transcription
+    const result = await whisperAsync(invalidParams);
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    // When VAD model doesn't exist, it should return empty transcription
+    expect(result.transcription.length).toBe(0);
+  }, 10000);
+
+  test('VAD parameter parsing', async () => {
+    // Test that VAD parameters are properly parsed (even if VAD model doesn't exist)
+    const vadParams = {
+      ...commonParams,
+      vad: false, // Disabled so no model required
+      vad_threshold: 0.7,
+      vad_min_speech_duration_ms: 300,
+      vad_min_silence_duration_ms: 150,
+      vad_max_speech_duration_s: 45.0,
+      vad_speech_pad_ms: 50,
+      vad_samples_overlap: 0.15
+    };
+
+    const result = await whisperAsync(vadParams);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+  }, 30000);
+
+  test('Progress callback with VAD disabled', async () => {
+    let progressCalled = false;
+    let lastProgress = 0;
+
+    const params = {
+      ...commonParams,
+      vad: false,
+      progress_callback: (progress) => {
+        progressCalled = true;
+        lastProgress = progress;
+        expect(progress).toBeGreaterThanOrEqual(0);
+        expect(progress).toBeLessThanOrEqual(100);
+      }
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(progressCalled).toBe(true);
+    expect(lastProgress).toBe(100);
+    expect(typeof result).toBe('object');
+  }, 30000);
+
+  test('Language detection without VAD', async () => {
+    const params = {
+      ...commonParams,
+      vad: false,
+      detect_language: true,
+      language: 'auto'
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(typeof result.language).toBe('string');
+    expect(result.language.length).toBeGreaterThan(0);
+  }, 30000);
+
+  test('Basic transcription with all VAD parameters set', async () => {
+    // Test with VAD disabled but all parameters set to ensure no crashes
+    const params = {
+      ...commonParams,
+      vad: false, // Disabled so it works without VAD model
+      vad_model: '', // Empty model path
+      vad_threshold: 0.6,
+      vad_min_speech_duration_ms: 200,
+      vad_min_silence_duration_ms: 80,
+      vad_max_speech_duration_s: 25.0,
+      vad_speech_pad_ms: 40,
+      vad_samples_overlap: 0.08
+    };
+
+    const result = await whisperAsync(params);
+
+    expect(typeof result).toBe('object');
+    expect(Array.isArray(result.transcription)).toBe(true);
+    expect(result.transcription.length).toBeGreaterThan(0);
+  }, 30000);
 });
 
```
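The tests above also pin down the result shape: `result.transcription` is an array of `[start, end, text]` segments (hence the `segment[2]` access), and `result.language` is a string when language detection runs. A small helper sketch built on that observed shape:

```javascript
// Sketch based on the shape asserted in whisper.spec.js:
// result.transcription is an array of [t0, t1, text] segments.
function transcriptionToText(result) {
  if (!result || !Array.isArray(result.transcription)) {
    throw new TypeError("expected a result object with a transcription array");
  }
  return result.transcription.map((segment) => segment[2]).join(" ").trim();
}

// Usage: const text = transcriptionToText(await whisperAsync(params));
```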
examples/addon.node/addon.cpp (CHANGED)

```diff
@@ -9,6 +9,7 @@
 #include <vector>
 #include <cmath>
 #include <cstdint>
+#include <cfloat>
 
 struct whisper_params {
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -51,6 +52,16 @@ struct whisper_params {
     std::vector<std::string> fname_out = {};
 
     std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
+
+    // Voice Activity Detection (VAD) parameters
+    bool vad = false;
+    std::string vad_model = "";
+    float vad_threshold = 0.5f;
+    int vad_min_speech_duration_ms = 250;
+    int vad_min_silence_duration_ms = 100;
+    float vad_max_speech_duration_s = FLT_MAX;
+    int vad_speech_pad_ms = 30;
+    float vad_samples_overlap = 0.1f;
 };
 
 struct whisper_print_user_data {
@@ -333,16 +344,16 @@ class ProgressWorker : public Napi::AsyncWorker {
         };
         wparams.progress_callback_user_data = this;
 
+        // Set VAD parameters
+        wparams.vad = params.vad;
+        wparams.vad_model_path = params.vad_model.c_str();
 
+        wparams.vad_params.threshold = params.vad_threshold;
+        wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
+        wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+        wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
+        wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
+        wparams.vad_params.samples_overlap = params.vad_samples_overlap;
 
         if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
             fprintf(stderr, "failed to process audio\n");
@@ -385,14 +396,46 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     std::string language = whisper_params.Get("language").As<Napi::String>();
     std::string model = whisper_params.Get("model").As<Napi::String>();
     std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
-    bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
-    bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
-    bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
-    bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
-    bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
-    int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
-    bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
-    int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
+
+    bool use_gpu = true;
+    if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) {
+        use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
+    }
+
+    bool flash_attn = false;
+    if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) {
+        flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
+    }
+
+    bool no_prints = false;
+    if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) {
+        no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
+    }
+
+    bool no_timestamps = false;
+    if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) {
+        no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
+    }
+
+    bool detect_language = false;
+    if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) {
+        detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
+    }
+
+    int32_t audio_ctx = 0;
+    if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) {
+        audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
+    }
+
+    bool comma_in_time = true;
+    if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) {
+        comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
+    }
+
+    int32_t max_len = 0;
+    if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
+        max_len = whisper_params.Get("max_len").As<Napi::Number>();
+    }
 
     // Add support for max_context
     int32_t max_context = -1;
@@ -408,7 +451,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
 
     // Add support for print_progress
     bool print_progress = false;
-    if (whisper_params.Has("print_progress")) {
+    if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) {
         print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
     }
     // Add support for progress_callback
@@ -417,6 +460,47 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
         progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
     }
 
+    // Add support for VAD parameters
+    bool vad = false;
+    if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) {
+        vad = whisper_params.Get("vad").As<Napi::Boolean>();
+    }
+
+    std::string vad_model = "";
+    if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) {
+        vad_model = whisper_params.Get("vad_model").As<Napi::String>();
+    }
+
+    float vad_threshold = 0.5f;
+    if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) {
+        vad_threshold = whisper_params.Get("vad_threshold").As<Napi::Number>();
+    }
+
+    int vad_min_speech_duration_ms = 250;
+    if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) {
+        vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As<Napi::Number>();
+    }
+
+    int vad_min_silence_duration_ms = 100;
+    if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) {
+        vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As<Napi::Number>();
+    }
+
+    float vad_max_speech_duration_s = FLT_MAX;
+    if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) {
+        vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As<Napi::Number>();
+    }
+
+    int vad_speech_pad_ms = 30;
+    if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) {
+        vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As<Napi::Number>();
+    }
+
+    float vad_samples_overlap = 0.1f;
+    if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) {
+        vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As<Napi::Number>();
+    }
+
     Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
     std::vector<float> pcmf32_vec;
     if (pcmf32Value.IsTypedArray()) {
@@ -444,6 +528,16 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     params.prompt = prompt;
     params.detect_language = detect_language;
 
+    // Set VAD parameters
+    params.vad = vad;
+    params.vad_model = vad_model;
+    params.vad_threshold = vad_threshold;
+    params.vad_min_speech_duration_ms = vad_min_speech_duration_ms;
+    params.vad_min_silence_duration_ms = vad_min_silence_duration_ms;
+    params.vad_max_speech_duration_s = vad_max_speech_duration_s;
+    params.vad_speech_pad_ms = vad_speech_pad_ms;
+    params.vad_samples_overlap = vad_samples_overlap;
+
     Napi::Function callback = info[1].As<Napi::Function>();
     // Create a new Worker class with progress callback support
     ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
```
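Note the parsing pattern above: each option is read only behind a `Has(...)` check plus a type check (`IsBoolean()`, `IsNumber()`, `IsString()`), so a property that is omitted or has the wrong JS type silently keeps its C++ default rather than throwing. A sketch of the behavior this implies on the JS side (paths assumed as in the README; the expected outcome is an inference from the guards, not an assertion from the PR):

```javascript
const path = require("path");
const { promisify } = require("util");
const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));

const whisperAsync = promisify(whisper);

const base = {
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
  vad: false
};

// "0.9" is a string, so IsNumber() fails in addon.cpp and vad_threshold
// should silently keep its 0.5 default; no error is surfaced to JS.
whisperAsync({ ...base, vad_threshold: "0.9" })
  .then((result) => console.log(result.transcription.length, "segments"));
```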
examples/addon.node/vad-example.js (ADDED)

```diff
@@ -0,0 +1,132 @@
+const path = require("path");
+const { whisper } = require(path.join(
+  __dirname,
+  "../../build/Release/addon.node"
+));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// Example with VAD enabled
+const vadParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  use_gpu: true,
+  flash_attn: false,
+  no_prints: false,
+  comma_in_time: true,
+  translate: false,
+  no_timestamps: false,
+  detect_language: false,
+  audio_ctx: 0,
+  max_len: 0,
+  // VAD parameters
+  vad: true,
+  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model
+  vad_threshold: 0.5,
+  vad_min_speech_duration_ms: 250,
+  vad_min_silence_duration_ms: 100,
+  vad_max_speech_duration_s: 30.0,
+  vad_speech_pad_ms: 30,
+  vad_samples_overlap: 0.1,
+  progress_callback: (progress) => {
+    console.log(`VAD Transcription progress: ${progress}%`);
+  }
+};
+
+// Example without VAD (traditional approach)
+const traditionalParams = {
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+  use_gpu: true,
+  flash_attn: false,
+  no_prints: false,
+  comma_in_time: true,
+  translate: false,
+  no_timestamps: false,
+  detect_language: false,
+  audio_ctx: 0,
+  max_len: 0,
+  vad: false, // Explicitly disable VAD
+  progress_callback: (progress) => {
+    console.log(`Traditional transcription progress: ${progress}%`);
+  }
+};
+
+async function runVADExample() {
+  try {
+    console.log("=== Whisper.cpp Node.js VAD Example ===\n");
+
+    // Check if VAD model exists
+    const fs = require('fs');
+    if (!fs.existsSync(vadParams.vad_model)) {
+      console.log("⚠️  VAD model not found. Please download the VAD model first:");
+      console.log("   ./models/download-vad-model.sh silero-v5.1.2");
+      console.log("   Or run: python models/convert-silero-vad-to-ggml.py");
+      console.log("\n   Falling back to traditional transcription without VAD...\n");
+
+      // Run without VAD
+      console.log("🎵 Running traditional transcription...");
+      const traditionalResult = await whisperAsync(traditionalParams);
+      console.log("\n📝 Traditional transcription result:");
+      console.log(traditionalResult);
+      return;
+    }
+
+    console.log("🎵 Running transcription with VAD enabled...");
+    console.log("VAD Parameters:");
+    console.log(`  - Threshold: ${vadParams.vad_threshold}`);
+    console.log(`  - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`);
+    console.log(`  - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`);
+    console.log(`  - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`);
+    console.log(`  - Speech padding: ${vadParams.vad_speech_pad_ms}ms`);
+    console.log(`  - Samples overlap: ${vadParams.vad_samples_overlap}\n`);
+
+    const startTime = Date.now();
+    const vadResult = await whisperAsync(vadParams);
+    const vadDuration = Date.now() - startTime;
+
+    console.log("\n✅ VAD transcription completed!");
+    console.log(`⏱️  Processing time: ${vadDuration}ms`);
+    console.log("\n📝 VAD transcription result:");
+    console.log(vadResult);
+
+    // Compare with traditional approach
+    console.log("\n🔄 Running traditional transcription for comparison...");
+    const traditionalStartTime = Date.now();
+    const traditionalResult = await whisperAsync(traditionalParams);
+    const traditionalDuration = Date.now() - traditionalStartTime;
+
+    console.log("\n✅ Traditional transcription completed!");
+    console.log(`⏱️  Processing time: ${traditionalDuration}ms`);
+    console.log("\n📝 Traditional transcription result:");
+    console.log(traditionalResult);
+
+    // Performance comparison
+    console.log("\n📊 Performance Comparison:");
+    console.log(`VAD: ${vadDuration}ms`);
+    console.log(`Traditional: ${traditionalDuration}ms`);
+    const speedup = traditionalDuration / vadDuration;
+    if (speedup > 1) {
+      console.log(`🚀 VAD is ${speedup.toFixed(2)}x faster!`);
+    } else {
+      console.log(`ℹ️  Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`);
+    }
+
+  } catch (error) {
+    console.error("❌ Error during transcription:", error);
+  }
+}
+
+// Run the example
+if (require.main === module) {
+  runVADExample();
+}
+
+module.exports = {
+  runVADExample,
+  vadParams,
+  traditionalParams
+};
```