"""Whisper STT provider implementation."""
import logging
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ...domain.models.audio_content import AudioContent
from ...domain.models.text_content import TextContent
from ..base.stt_provider_base import STTProviderBase
from ...domain.exceptions import SpeechRecognitionException
logger = logging.getLogger(__name__)


class WhisperSTTProvider(STTProviderBase):
    """Whisper STT provider using the faster-whisper implementation."""

    def __init__(self):
        """Initialize the Whisper STT provider."""
        super().__init__(
            provider_name="Whisper",
            supported_languages=["en", "zh"]
        )
        self.model = None
        self._device = None
        self._compute_type = None
        self._initialize_device_settings()

    def _initialize_device_settings(self):
        """Initialize device and compute type settings."""
        try:
            import torch
            self._device = "cuda" if torch.cuda.is_available() else "cpu"
        except ImportError:
            # Fall back to CPU if torch is not available
            self._device = "cpu"
        # Half precision on GPU, 8-bit quantization on CPU
        self._compute_type = "float16" if self._device == "cuda" else "int8"
        logger.info(
            f"Whisper provider initialized with device: {self._device}, "
            f"compute_type: {self._compute_type}"
        )

    def _perform_transcription(self, audio_path: Path, model: str) -> str:
        """
        Perform transcription using faster-whisper.

        Args:
            audio_path: Path to the preprocessed audio file
            model: The model name to use

        Returns:
            str: The transcribed text
        """
        try:
            # Lazy load the model if not already loaded
            if self.model is None:
                self._load_model(model)

            # Perform transcription
            segments, info = self.model.transcribe(
                str(audio_path),
                beam_size=5,
                language="en",  # Can be made configurable
                task="transcribe"
            )
            logger.info(
                f"Detected language '{info.language}' with probability {info.language_probability}"
            )

            # Collect all segments into a single text
            result_text = ""
            for segment in segments:
                result_text += segment.text + " "
                logger.info(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
            result = result_text.strip()

            logger.info("Whisper transcription completed successfully")
            return result
        except Exception as e:
            self._handle_provider_error(e, "transcription")

    def _load_model(self, model_name: str):
        """
        Load the Whisper model based on the requested model name.

        Args:
            model_name: The requested model name (e.g., "whisper-large")
        """
        # Map requested model names to actual faster-whisper model identifiers
        model_mapping = {
            "whisper-large": "large-v3",
            "whisper-large-v1": "large-v1",
            "whisper-large-v2": "large-v2",
            "whisper-large-v3": "large-v3",
            "whisper-medium": "medium",
            "whisper-medium.en": "medium.en",
            "whisper-small": "small",
            "whisper-small.en": "small.en",
            "whisper-base": "base",
            "whisper-base.en": "base.en",
            "whisper-tiny": "tiny",
            "whisper-tiny.en": "tiny.en",
        }
        # Resolve the name before the try block so the error message below can
        # always reference it, falling back to large-v3 for unknown names
        actual_model = model_mapping.get(model_name.lower(), "large-v3")
        try:
            from faster_whisper import WhisperModel as FasterWhisperModel

            logger.info(f"Loading Whisper model: {actual_model} (requested: {model_name})")
            logger.info(f"Using device: {self._device}, compute_type: {self._compute_type}")
            self.model = FasterWhisperModel(
                actual_model,
                device=self._device,
                compute_type=self._compute_type
            )
        except ImportError as e:
            raise SpeechRecognitionException(
                "faster-whisper not available. Please install with: uv add faster-whisper"
            ) from e
        except Exception as e:
            raise SpeechRecognitionException(
                f"Failed to load Whisper model '{actual_model}' (requested: {model_name})"
            ) from e

    def is_available(self) -> bool:
        """
        Check if the Whisper provider is available.

        Returns:
            bool: True if faster-whisper is available, False otherwise
        """
        try:
            import faster_whisper  # noqa: F401 - imported only to check availability
            return True
        except ImportError:
            logger.warning("faster-whisper not available")
            return False

    def get_available_models(self) -> list[str]:
        """
        Get list of available Whisper models.

        Returns:
            list[str]: List of available model names
        """
        # Names match the keys accepted by _load_model's model_mapping
        return [
            "whisper-tiny",
            "whisper-tiny.en",
            "whisper-base",
            "whisper-base.en",
            "whisper-small",
            "whisper-small.en",
            "whisper-medium",
            "whisper-medium.en",
            "whisper-large-v1",
            "whisper-large-v2",
            "whisper-large-v3",
        ]

    def get_default_model(self) -> str:
        """
        Get the default model for this provider.

        Returns:
            str: Default model name
        """
        return "whisper-medium"