Commit f374409
Parent(s): 2b5f9bc
fix: improve audio processing in transcribe function with longer chunk duration and normalization
app.py CHANGED
@@ -54,7 +54,6 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
         return state, state, audio_buffer, last_processed_time
 
     print(f"Received audio input of type: {type(audio)}")
-
     if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray):
         sample_rate, audio_data = audio
         print(f"Sample rate: {sample_rate}, Audio shape: {audio_data.shape}")
@@ -67,15 +66,16 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
         total_duration = total_samples / sample_rate
         print(f"Total buffered duration: {total_duration:.2f}s")
 
-        # Process
-
-
-
-
+        # Process 5-second chunks with 2-second step size (3-second overlap)
+        # Using longer chunks usually helps with transcription accuracy
+        chunk_duration = 5.0  # seconds (increased from 2.0)
+        step_size = 2.0  # seconds (increased from 1.0)
+        # min_samples = int(chunk_duration * 16000)  # 5s at 16kHz
+
         if total_duration < chunk_duration:
             print(f"Buffering audio, total duration: {total_duration:.2f}s")
             return state, state, audio_buffer, last_processed_time
-
+
         try:
             # Concatenate buffered chunks
             full_audio = np.concatenate(audio_buffer)
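For orientation, the new constants mean each transcription window covers the most recent 5 s of audio while the window start advances 2 s at a time, so consecutive windows overlap by 3 s. A minimal, self-contained sketch of that sliding-window arithmetic (the helper name and loop are illustrative, not code from app.py):

import numpy as np

SAMPLE_RATE = 16000
CHUNK_DURATION = 5.0   # seconds per window
STEP_SIZE = 2.0        # seconds between window starts (3 s overlap)

def iter_chunks(buffer: np.ndarray, start_time: float = 0.0):
    """Yield (chunk, end_time) for every full window that fits in the buffer."""
    chunk_samples = int(CHUNK_DURATION * SAMPLE_RATE)
    step_samples = int(STEP_SIZE * SAMPLE_RATE)
    start = int(start_time * SAMPLE_RATE)
    while start + chunk_samples <= len(buffer):
        yield buffer[start:start + chunk_samples], (start + chunk_samples) / SAMPLE_RATE
        start += step_samples

# 12 s of buffered audio produces windows starting at 0 s, 2 s, 4 s and 6 s.
for chunk, end_time in iter_chunks(np.zeros(12 * SAMPLE_RATE)):
    print(f"window ending at {end_time:.1f}s, {len(chunk)} samples")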
@@ -88,7 +88,12 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
             else:
                 full_audio = full_audio.astype(float)
 
-            #
+            # Normalize audio (helps with consistent volume levels)
+            if np.abs(full_audio).max() > 0:
+                full_audio = full_audio / np.abs(full_audio).max() * 0.9
+                print("Audio normalized to improve transcription")
+
+            # Process chunks
             new_state = state
             current_time = last_processed_time
             total_samples_16k = len(full_audio)
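The added block is a plain peak normalization: scale the buffer so its loudest sample sits at 0.9, skipping all-silent buffers to avoid dividing by zero. A standalone sketch of the same operation (the function name is illustrative):

import numpy as np

def normalize_peak(audio: np.ndarray, target_peak: float = 0.9) -> np.ndarray:
    peak = np.abs(audio).max()
    if peak == 0:
        return audio  # silent buffer: nothing to scale
    return audio / peak * target_peak

quiet = np.array([0.01, -0.02, 0.015])
print(normalize_peak(quiet))  # [ 0.45  -0.9    0.675]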
@@ -107,6 +112,7 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
                 sf.write(temp_file, chunk, samplerate=16000)
 
                 # Transcribe
+                print(f"Transcribing chunk of duration {chunk_duration}s...")
                 hypothesis = model.transcribe([temp_file])[0]
                 transcription = hypothesis.text
                 print(f"Transcription: {transcription}")
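For context on the step being logged here: each window is written to a 16 kHz WAV file and the path is handed to NeMo's transcribe(). A trimmed-down sketch under the same assumptions as the diff (sf is soundfile and model is the already-loaded NeMo ASR model; the temp-file handling is illustrative, and on some NeMo versions transcribe() returns plain strings rather than hypothesis objects):

import os
import tempfile

import numpy as np
import soundfile as sf

def transcribe_chunk(model, chunk: np.ndarray) -> str:
    """Write one audio window to a temporary WAV and transcribe it."""
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        temp_file = f.name
    try:
        sf.write(temp_file, chunk, samplerate=16000)
        hypothesis = model.transcribe([temp_file])[0]
        return hypothesis.text
    finally:
        os.remove(temp_file)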
@@ -181,10 +187,14 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                 label="Select ASR Model"
             )
         with gr.Column(scale=1):
-            load_button = gr.Button("Load Selected Model")
+            load_button = gr.Button("Load Selected Model", elem_id="load-button", elem_classes=["btn-blue"])
 
             # Status indicator for model loading
-            model_status = gr.Textbox(
+            model_status = gr.Textbox(
+                value=f"Current model: {current_model_name}",
+                label="Model Status",
+                container=False
+            )
 
     # Create tabs for real-time and file-based transcription
     with gr.Tabs():
@@ -199,7 +209,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                         label="Speak into your microphone"
                     )
 
-                    clear_btn = gr.Button("Clear Transcript")
+                    # clear_btn = gr.Button("Clear Transcript")
 
                 with gr.Column(scale=3):
                     text_output = gr.Textbox(
@@ -212,7 +222,7 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
                         placeholder="Real-time results will appear here...",
                         lines=2
                     )
-
+        # File-based transcription tab
         with gr.TabItem("File Transcription"):
             with gr.Row():
                 with gr.Column(scale=2):
@@ -258,7 +268,8 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         inputs=[model_dropdown],
         outputs=[model_status, audio_buffer, last_processed_time]
     )
-
+    # Handle the audio stream for real-time transcription
+    streaming_text = gr.State(value="")
     audio_input.stream(
         fn=transcribe,
         inputs=[audio_input, model_dropdown, state, audio_buffer, last_processed_time],
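The stream wiring follows the usual Gradio pattern: a gr.State carries the running transcript, and the microphone component's stream() event calls the transcription function on every new chunk. A hypothetical, much-reduced illustration of that pattern (the component names and dummy callback are not from app.py; on Gradio 3.x the microphone is selected with source="microphone" instead of sources=["microphone"]):

import gradio as gr

def on_chunk(chunk, transcript):
    # A real callback would buffer the chunk and run ASR; this only shows the wiring.
    if chunk is not None:
        transcript = transcript + " [chunk received]"
    return transcript, transcript

with gr.Blocks() as demo:
    transcript_state = gr.State(value="")
    mic = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
    live_text = gr.Textbox(label="Live transcript")
    mic.stream(fn=on_chunk,
               inputs=[mic, transcript_state],
               outputs=[transcript_state, live_text])

# demo.launch()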
@@ -272,16 +283,6 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
         outputs=[file_transcription]
     )
 
-    # Clear the transcription
-    def clear_transcription():
-        return "", "", None, 0
-
-    clear_btn.click(
-        fn=clear_transcription,
-        inputs=[],
-        outputs=[text_output, streaming_text, audio_buffer, last_processed_time]
-    )
-
     # Update the main text output when the state changes
     state.change(
         fn=lambda s: s,