| | import argparse |
| | import os |
| | from pathlib import Path |
| |
|
| | import librosa |
| | import numpy as np |
| | import soundfile as sf |
| | import torch |
| |
|
| | from encoder import inference as encoder |
| | from encoder.params_model import model_embedding_size as speaker_embedding_size |
| | from synthesizer.inference import Synthesizer |
| | from utils.argutils import print_args |
| | from utils.default_models import ensure_default_models |
| | from vocoder import inference as vocoder |
| |
|
| |
|
# GUI-less demo of the SV2TTS pipeline: load the three models, smoke-test each
# stage with tiny inputs, then loop interactively -- clone a reference voice
# from an audio file and synthesize typed sentences to numbered wav files.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="saved_models/default/encoder.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_fpath", type=Path,
                        default="saved_models/default/synthesizer.pt",
                        help="Path to a saved synthesizer")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="saved_models/default/vocoder.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--cpu", action="store_true", help=\
        "If True, processing is done on CPU, even when a GPU is available.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    parser.add_argument("--seed", type=int, default=None, help=\
        "Optional random number seed value to make toolbox deterministic.")
    args = parser.parse_args()
    arg_dict = vars(args)
    print_args(args, parser)

    # Hide all GPUs from torch if CPU-only inference was requested. Note that
    # vars() returns the live namespace dict, so pop() also removes `args.cpu`.
    if arg_dict.pop("cpu"):
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    print("Running a test of your configuration...\n")

    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    # Load the three models, downloading the defaults first if they are missing.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    ensure_default_models(Path("saved_models"))
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_fpath)
    vocoder.load_model(args.voc_model_fpath)

    # Smoke-test each stage with minimal inputs before entering the loop.
    print("Testing your configuration with small inputs.")

    # One "second" of zeros (encoder.sampling_rate samples) is enough to
    # exercise the encoder end to end.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # The synthesizer only needs an embedding of the right size; speaker
    # embeddings are L2-normalized, so a normalized random vector works here.
    embed = np.random.rand(speaker_embedding_size)
    embed /= np.linalg.norm(embed)
    # Two (text, embedding) pairs to exercise batched synthesis.
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # Concatenate the two test spectrograms along the time axis and vocode
    # them as one utterance, with progress reporting silenced.
    mel = np.concatenate(mels, axis=1)
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)

    print("All test passed! You can now synthesize speech.\n\n")

    print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
          "show how you can interface this project easily with your own. See the source code for "
          "an explanation of what is happening.\n")

    print("Interactive generation loop")
    num_generated = 0
    while True:
        try:
            # Step 1: load and preprocess the reference utterance. Quotes are
            # stripped so shell-quoted / drag-and-dropped paths work.
            message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
                      "wav, m4a, flac, ...):\n"
            in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))

            # BUGFIX: the file was previously decoded twice -- once with
            # encoder.preprocess_wav(in_fpath), whose result was immediately
            # discarded, then again through librosa. Only the librosa path
            # (whose result was actually used) is kept.
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            print("Loaded file succesfully")

            # Step 2: derive the speaker embedding from the reference audio.
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            # Step 3: synthesize a mel spectrogram for the typed sentence.
            text = input("Write a sentence (+-20 words) to be synthesized:\n")

            # When seeding, the synthesizer is re-instantiated after setting
            # the seed -- presumably so weight/model state is reproducible
            # under the seed (TODO confirm against Synthesizer internals).
            if args.seed is not None:
                torch.manual_seed(args.seed)
                synthesizer = Synthesizer(args.syn_model_fpath)

            # The synthesizer API is batched; wrap the single input in lists.
            texts = [text]
            embeds = [embed]
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            # Step 4: vocode the spectrogram into a waveform.
            print("Synthesizing the waveform:")

            # Same deterministic-seeding dance for the vocoder.
            if args.seed is not None:
                torch.manual_seed(args.seed)
                vocoder.load_model(args.voc_model_fpath)

            generated_wav = vocoder.infer_waveform(spec)

            # Pad with one second of silence -- presumably so playback below
            # is not cut off early (TODO confirm).
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

            # Run the output back through the encoder's preprocessing --
            # presumably to trim excess silence (TODO confirm).
            generated_wav = encoder.preprocess_wav(generated_wav)

            # Step 5: optionally play the result. sounddevice is imported
            # lazily so --no_sound works without PortAudio installed.
            if not args.no_sound:
                import sounddevice as sd
                try:
                    sd.stop()
                    sd.play(generated_wav, synthesizer.sample_rate)
                except sd.PortAudioError as e:
                    # BUGFIX: a trailing bare `except: raise` clause was
                    # removed here -- it re-raised everything it caught and
                    # was therefore a no-op. Playback failure is best-effort:
                    # report it and carry on.
                    print("\nCaught exception: %s" % repr(e))
                    print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")

            # Step 6: save the waveform to a numbered output file.
            filename = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % filename)

        except Exception as e:
            # Top-level boundary of the interactive loop: report any failure
            # (bad path, decode error, OOM, ...) and restart the prompt.
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
|