Spaces:
Running
talk, talk-llama : pass text_to_speak as a file (#1865)
Browse files
* talk-llama: pass file instead of arg
it is too hard to quote text in a portable way
* talk-llama: pass heard_ok as a file
* talk-llama: let eleven-labs.py accept options
Options: -v voice, -s savefile, -p (--play)
* talk-llama: check installed commands in "speak"
Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed
* talk-llama: pass voice_id again
in order to sync talk with talk-llama
* talk: sync with talk-llama
Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375
* talk and talk-llama: get all installed voices in speak.ps1
* talk and talk-llama: get voices from api
* talk and talk-llama: add more options to eleven-labs.py
and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)
```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```
* examples: add speak_with_file()
as suggested in the review
* talk and talk-llama: ignore to_speak.txt
- examples/common.cpp +18 -0
- examples/common.h +3 -0
- examples/talk-llama/.gitignore +1 -0
- examples/talk-llama/eleven-labs.py +73 -13
- examples/talk-llama/speak +33 -25
- examples/talk-llama/speak.bat +1 -1
- examples/talk-llama/speak.ps1 +6 -4
- examples/talk-llama/talk-llama.cpp +5 -9
- examples/talk/.gitignore +1 -0
- examples/talk/eleven-labs.py +73 -13
- examples/talk/speak +33 -17
- examples/talk/speak.ps1 +6 -4
- examples/talk/talk.cpp +5 -5
|
@@ -863,3 +863,21 @@ bool is_file_exist(const char *fileName)
|
|
| 863 |
std::ifstream infile(fileName);
|
| 864 |
return infile.good();
|
| 865 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 863 |
std::ifstream infile(fileName);
|
| 864 |
return infile.good();
|
| 865 |
}
|
| 866 |
+
|
| 867 |
+
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
|
| 868 |
+
{
|
| 869 |
+
std::ofstream speak_file(path.c_str());
|
| 870 |
+
if (speak_file.fail()) {
|
| 871 |
+
fprintf(stderr, "%s: failed to open speak_file\n", __func__);
|
| 872 |
+
return false;
|
| 873 |
+
} else {
|
| 874 |
+
speak_file.write(text.c_str(), text.size());
|
| 875 |
+
speak_file.close();
|
| 876 |
+
int ret = system((command + " " + std::to_string(voice_id) + " " + path).c_str());
|
| 877 |
+
if (ret != 0) {
|
| 878 |
+
fprintf(stderr, "%s: failed to speak\n", __func__);
|
| 879 |
+
return false;
|
| 880 |
+
}
|
| 881 |
+
}
|
| 882 |
+
return true;
|
| 883 |
+
}
|
|
@@ -306,3 +306,6 @@ int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
|
|
| 306 |
|
| 307 |
// check if file exists using ifstream
|
| 308 |
bool is_file_exist(const char *fileName);
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
// check if file exists using ifstream
|
| 308 |
bool is_file_exist(const char *fileName);
|
| 309 |
+
|
| 310 |
+
// write text to file, and call system("command voice_id file")
|
| 311 |
+
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
|
|
@@ -1 +1,2 @@
|
|
| 1 |
audio.mp3
|
|
|
|
|
|
| 1 |
audio.mp3
|
| 2 |
+
to_speak.txt
|
|
@@ -1,20 +1,80 @@
|
|
| 1 |
import sys
|
| 2 |
-
import
|
|
|
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
# Generate the TTS
|
| 14 |
audio = generate(
|
| 15 |
-
|
| 16 |
-
|
| 17 |
)
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
| 1 |
import sys
|
| 2 |
+
import argparse
|
| 3 |
+
import textwrap
|
| 4 |
|
| 5 |
+
parser = argparse.ArgumentParser(add_help=False,
|
| 6 |
+
formatter_class=argparse.RawTextHelpFormatter)
|
| 7 |
+
parser.add_argument("-q", "--quick", action="store_true",
|
| 8 |
+
help="skip checking the required library")
|
| 9 |
+
|
| 10 |
+
modes = parser.add_argument_group("action")
|
| 11 |
+
modes.add_argument("inputfile", metavar="TEXTFILE",
|
| 12 |
+
nargs='?', type=argparse.FileType(), default=sys.stdin,
|
| 13 |
+
help="read the text file (default: stdin)")
|
| 14 |
+
modes.add_argument("-l", "--list", action="store_true",
|
| 15 |
+
help="show the list of voices and exit")
|
| 16 |
+
modes.add_argument("-h", "--help", action="help",
|
| 17 |
+
help="show this help and exit")
|
| 18 |
+
|
| 19 |
+
selopts = parser.add_argument_group("voice selection")
|
| 20 |
+
selmodes = selopts.add_mutually_exclusive_group()
|
| 21 |
+
selmodes.add_argument("-n", "--name",
|
| 22 |
+
default="Arnold",
|
| 23 |
+
help="get a voice object by name (default: Arnold)")
|
| 24 |
+
selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
|
| 25 |
+
help="get a voice object by number (see --list)")
|
| 26 |
+
selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
|
| 27 |
+
default=["use case=narration"],
|
| 28 |
+
help=textwrap.dedent('''\
|
| 29 |
+
filter voices by labels (default: "use case=narration")
|
| 30 |
+
this option can be used multiple times
|
| 31 |
+
filtering will be disabled if the first -f has no "=" (e.g. -f "any")
|
| 32 |
+
'''))
|
| 33 |
+
|
| 34 |
+
outmodes = parser.add_argument_group("output")
|
| 35 |
+
outgroup = outmodes.add_mutually_exclusive_group()
|
| 36 |
+
outgroup.add_argument("-s", "--save", metavar="FILE",
|
| 37 |
+
default="audio.mp3",
|
| 38 |
+
help="save the TTS to a file (default: audio.mp3)")
|
| 39 |
+
outgroup.add_argument("-p", "--play", action="store_true",
|
| 40 |
+
help="play the TTS with ffplay")
|
| 41 |
+
|
| 42 |
+
args = parser.parse_args()
|
| 43 |
|
| 44 |
+
if not args.quick:
|
| 45 |
+
import importlib.util
|
| 46 |
+
if importlib.util.find_spec("elevenlabs") is None:
|
| 47 |
+
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
|
| 48 |
+
sys.exit()
|
| 49 |
|
| 50 |
+
from elevenlabs import voices, generate, play, save
|
| 51 |
+
|
| 52 |
+
if args.filter and "=" in args.filter[0]:
|
| 53 |
+
voicelist = voices()
|
| 54 |
+
for f in args.filter:
|
| 55 |
+
label, value = f.split("=")
|
| 56 |
+
voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
|
| 57 |
+
voicelist = list(voicelist)
|
| 58 |
+
else:
|
| 59 |
+
voicelist = list(voices())
|
| 60 |
+
|
| 61 |
+
if args.list:
|
| 62 |
+
for i, v in enumerate(voicelist):
|
| 63 |
+
print(str(i) + ": " + v.name + " " + str(v.labels))
|
| 64 |
+
sys.exit()
|
| 65 |
+
|
| 66 |
+
if args.voice:
|
| 67 |
+
voice = voicelist[args.voice % len(voicelist)]
|
| 68 |
+
else:
|
| 69 |
+
voice = args.name
|
| 70 |
+
# if -n should consult -f, use the following
|
| 71 |
+
#voice = next(x for x in voicelist if x.name == args.name)
|
| 72 |
|
|
|
|
| 73 |
audio = generate(
|
| 74 |
+
text=str(args.inputfile.read()),
|
| 75 |
+
voice=voice
|
| 76 |
)
|
| 77 |
+
if args.play:
|
| 78 |
+
play(audio)
|
| 79 |
+
else:
|
| 80 |
+
save(audio, args.save)
|
|
@@ -1,32 +1,40 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
# Usage:
|
| 4 |
-
# speak
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
#
|
| 14 |
-
# https://github.com/rhasspy/piper
|
| 15 |
-
#
|
| 16 |
-
# Tested with Linux:
|
| 17 |
-
#
|
| 18 |
-
#echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
|
| 19 |
|
| 20 |
# for Mac
|
| 21 |
-
say
|
|
|
|
| 22 |
|
| 23 |
# Eleven Labs
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
#
|
| 28 |
-
#
|
| 29 |
-
#
|
| 30 |
-
#
|
| 31 |
-
#
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
# Usage:
|
| 4 |
+
# speak <voice_id> <textfile>
|
| 5 |
+
|
| 6 |
+
function installed() { command -v $1 >/dev/null 2>&1; }
|
| 7 |
+
|
| 8 |
+
if installed espeak; then
|
| 9 |
+
espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
|
| 10 |
+
|
| 11 |
+
elif installed piper && installed aplay; then
|
| 12 |
+
cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# for Mac
|
| 15 |
+
elif installed say; then
|
| 16 |
+
say -f $2
|
| 17 |
|
| 18 |
# Eleven Labs
|
| 19 |
+
elif installed python3 && \
|
| 20 |
+
python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
|
| 21 |
+
installed ffplay; then
|
| 22 |
+
# It's possible to use the API for free with limited number of characters.
|
| 23 |
+
# To increase this limit register to https://beta.elevenlabs.io to get an api key
|
| 24 |
+
# and paste it after 'ELEVEN_API_KEY='
|
| 25 |
+
# Keep the line commented to use the free version without api key
|
| 26 |
+
#export ELEVEN_API_KEY=your_api_key
|
| 27 |
+
wd=$(dirname $0)
|
| 28 |
+
script=$wd/eleven-labs.py
|
| 29 |
+
python3 $script -q -p -v $1 $2 >/dev/null 2>&1
|
| 30 |
+
|
| 31 |
+
# Uncomment to keep the audio file
|
| 32 |
+
#python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
|
| 33 |
+
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
|
| 34 |
+
|
| 35 |
+
else
|
| 36 |
+
echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
|
| 37 |
+
echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
|
| 38 |
+
echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
|
| 39 |
+
echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
|
| 40 |
+
fi
|
|
@@ -1 +1 @@
|
|
| 1 |
-
@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2
|
|
|
|
| 1 |
+
@powershell -ExecutionPolicy Bypass -F examples\talk-llama\speak.ps1 %1 %2
|
|
@@ -1,12 +1,14 @@
|
|
| 1 |
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
|
| 2 |
param(
|
| 3 |
-
|
| 4 |
-
[Parameter(Mandatory=$true)][string]$
|
| 5 |
-
[Parameter(Mandatory=$true)][string]$text
|
| 6 |
)
|
| 7 |
|
| 8 |
Add-Type -AssemblyName System.Speech;
|
| 9 |
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
|
| 10 |
-
$speak.
|
|
|
|
|
|
|
| 11 |
$speak.Rate="0";
|
|
|
|
| 12 |
$speak.Speak($text);
|
|
|
|
| 1 |
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
|
| 2 |
param(
|
| 3 |
+
[Parameter(Mandatory=$true)][int]$voicenum,
|
| 4 |
+
[Parameter(Mandatory=$true)][string]$textfile
|
|
|
|
| 5 |
)
|
| 6 |
|
| 7 |
Add-Type -AssemblyName System.Speech;
|
| 8 |
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
|
| 9 |
+
$voiceoptions = $speak.GetInstalledVoices("en-US");
|
| 10 |
+
$voice = $voiceoptions[$voicenum % $voiceoptions.count];
|
| 11 |
+
$speak.SelectVoice($voice.VoiceInfo.Name);
|
| 12 |
$speak.Rate="0";
|
| 13 |
+
$text = Get-Content -Path $textfile;
|
| 14 |
$speak.Speak($text);
|
|
@@ -75,6 +75,7 @@ struct whisper_params {
|
|
| 75 |
std::string model_wsp = "models/ggml-base.en.bin";
|
| 76 |
std::string model_llama = "models/ggml-llama-7B.bin";
|
| 77 |
std::string speak = "./examples/talk-llama/speak";
|
|
|
|
| 78 |
std::string prompt = "";
|
| 79 |
std::string fname_out;
|
| 80 |
std::string path_session = ""; // path to file for saving/loading model eval state
|
|
@@ -113,6 +114,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 113 |
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
|
| 114 |
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
|
| 115 |
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
|
|
|
|
| 116 |
else if (arg == "--prompt-file") {
|
| 117 |
std::ifstream file(argv[++i]);
|
| 118 |
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
|
@@ -160,6 +162,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 160 |
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
|
| 161 |
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
|
| 162 |
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
|
|
|
|
| 163 |
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
|
| 164 |
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
|
| 165 |
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
|
@@ -546,10 +549,7 @@ int main(int argc, char ** argv) {
|
|
| 546 |
|
| 547 |
// optionally give audio feedback that the current text is being processed
|
| 548 |
if (!params.heard_ok.empty()) {
|
| 549 |
-
|
| 550 |
-
if (ret != 0) {
|
| 551 |
-
fprintf(stderr, "%s: failed to speak\n", __func__);
|
| 552 |
-
}
|
| 553 |
}
|
| 554 |
|
| 555 |
// remove text between brackets using regex
|
|
@@ -748,11 +748,7 @@ int main(int argc, char ** argv) {
|
|
| 748 |
}
|
| 749 |
}
|
| 750 |
|
| 751 |
-
|
| 752 |
-
int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
|
| 753 |
-
if (ret != 0) {
|
| 754 |
-
fprintf(stderr, "%s: failed to speak\n", __func__);
|
| 755 |
-
}
|
| 756 |
|
| 757 |
audio.clear();
|
| 758 |
}
|
|
|
|
| 75 |
std::string model_wsp = "models/ggml-base.en.bin";
|
| 76 |
std::string model_llama = "models/ggml-llama-7B.bin";
|
| 77 |
std::string speak = "./examples/talk-llama/speak";
|
| 78 |
+
std::string speak_file = "./examples/talk-llama/to_speak.txt";
|
| 79 |
std::string prompt = "";
|
| 80 |
std::string fname_out;
|
| 81 |
std::string path_session = ""; // path to file for saving/loading model eval state
|
|
|
|
| 114 |
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
|
| 115 |
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
|
| 116 |
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
|
| 117 |
+
else if (arg == "-sf" || arg == "--speak-file") { params.speak_file = argv[++i]; }
|
| 118 |
else if (arg == "--prompt-file") {
|
| 119 |
std::ifstream file(argv[++i]);
|
| 120 |
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
|
|
|
| 162 |
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
|
| 163 |
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
|
| 164 |
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
|
| 165 |
+
fprintf(stderr, " -sf FILE, --speak-file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
|
| 166 |
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
|
| 167 |
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
|
| 168 |
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
|
|
|
| 549 |
|
| 550 |
// optionally give audio feedback that the current text is being processed
|
| 551 |
if (!params.heard_ok.empty()) {
|
| 552 |
+
speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id);
|
|
|
|
|
|
|
|
|
|
| 553 |
}
|
| 554 |
|
| 555 |
// remove text between brackets using regex
|
|
|
|
| 748 |
}
|
| 749 |
}
|
| 750 |
|
| 751 |
+
speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 752 |
|
| 753 |
audio.clear();
|
| 754 |
}
|
|
@@ -1 +1,2 @@
|
|
| 1 |
audio.mp3
|
|
|
|
|
|
| 1 |
audio.mp3
|
| 2 |
+
to_speak.txt
|
|
@@ -1,20 +1,80 @@
|
|
| 1 |
import sys
|
| 2 |
-
import
|
|
|
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
# Generate the TTS
|
| 14 |
audio = generate(
|
| 15 |
-
|
| 16 |
-
|
| 17 |
)
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
| 1 |
import sys
|
| 2 |
+
import argparse
|
| 3 |
+
import textwrap
|
| 4 |
|
| 5 |
+
parser = argparse.ArgumentParser(add_help=False,
|
| 6 |
+
formatter_class=argparse.RawTextHelpFormatter)
|
| 7 |
+
parser.add_argument("-q", "--quick", action="store_true",
|
| 8 |
+
help="skip checking the required library")
|
| 9 |
+
|
| 10 |
+
modes = parser.add_argument_group("action")
|
| 11 |
+
modes.add_argument("inputfile", metavar="TEXTFILE",
|
| 12 |
+
nargs='?', type=argparse.FileType(), default=sys.stdin,
|
| 13 |
+
help="read the text file (default: stdin)")
|
| 14 |
+
modes.add_argument("-l", "--list", action="store_true",
|
| 15 |
+
help="show the list of voices and exit")
|
| 16 |
+
modes.add_argument("-h", "--help", action="help",
|
| 17 |
+
help="show this help and exit")
|
| 18 |
+
|
| 19 |
+
selopts = parser.add_argument_group("voice selection")
|
| 20 |
+
selmodes = selopts.add_mutually_exclusive_group()
|
| 21 |
+
selmodes.add_argument("-n", "--name",
|
| 22 |
+
default="Arnold",
|
| 23 |
+
help="get a voice object by name (default: Arnold)")
|
| 24 |
+
selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
|
| 25 |
+
help="get a voice object by number (see --list)")
|
| 26 |
+
selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
|
| 27 |
+
default=["use case=narration"],
|
| 28 |
+
help=textwrap.dedent('''\
|
| 29 |
+
filter voices by labels (default: "use case=narration")
|
| 30 |
+
this option can be used multiple times
|
| 31 |
+
filtering will be disabled if the first -f has no "=" (e.g. -f "any")
|
| 32 |
+
'''))
|
| 33 |
+
|
| 34 |
+
outmodes = parser.add_argument_group("output")
|
| 35 |
+
outgroup = outmodes.add_mutually_exclusive_group()
|
| 36 |
+
outgroup.add_argument("-s", "--save", metavar="FILE",
|
| 37 |
+
default="audio.mp3",
|
| 38 |
+
help="save the TTS to a file (default: audio.mp3)")
|
| 39 |
+
outgroup.add_argument("-p", "--play", action="store_true",
|
| 40 |
+
help="play the TTS with ffplay")
|
| 41 |
+
|
| 42 |
+
args = parser.parse_args()
|
| 43 |
|
| 44 |
+
if not args.quick:
|
| 45 |
+
import importlib.util
|
| 46 |
+
if importlib.util.find_spec("elevenlabs") is None:
|
| 47 |
+
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
|
| 48 |
+
sys.exit()
|
| 49 |
|
| 50 |
+
from elevenlabs import voices, generate, play, save
|
| 51 |
+
|
| 52 |
+
if args.filter and "=" in args.filter[0]:
|
| 53 |
+
voicelist = voices()
|
| 54 |
+
for f in args.filter:
|
| 55 |
+
label, value = f.split("=")
|
| 56 |
+
voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
|
| 57 |
+
voicelist = list(voicelist)
|
| 58 |
+
else:
|
| 59 |
+
voicelist = list(voices())
|
| 60 |
+
|
| 61 |
+
if args.list:
|
| 62 |
+
for i, v in enumerate(voicelist):
|
| 63 |
+
print(str(i) + ": " + v.name + " " + str(v.labels))
|
| 64 |
+
sys.exit()
|
| 65 |
+
|
| 66 |
+
if args.voice:
|
| 67 |
+
voice = voicelist[args.voice % len(voicelist)]
|
| 68 |
+
else:
|
| 69 |
+
voice = args.name
|
| 70 |
+
# if -n should consult -f, use the following
|
| 71 |
+
#voice = next(x for x in voicelist if x.name == args.name)
|
| 72 |
|
|
|
|
| 73 |
audio = generate(
|
| 74 |
+
text=str(args.inputfile.read()),
|
| 75 |
+
voice=voice
|
| 76 |
)
|
| 77 |
+
if args.play:
|
| 78 |
+
play(audio)
|
| 79 |
+
else:
|
| 80 |
+
save(audio, args.save)
|
|
@@ -1,24 +1,40 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
# Usage:
|
| 4 |
-
# speak
|
| 5 |
|
| 6 |
-
|
| 7 |
-
# Mac OS: brew install espeak
|
| 8 |
-
# Linux: apt-get install espeak
|
| 9 |
-
#
|
| 10 |
-
#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Eleven Labs
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
#
|
| 20 |
-
#
|
| 21 |
-
#
|
| 22 |
-
#
|
| 23 |
-
#
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
# Usage:
|
| 4 |
+
# speak <voice_id> <textfile>
|
| 5 |
|
| 6 |
+
function installed() { command -v $1 >/dev/null 2>&1; }
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
if installed espeak; then
|
| 9 |
+
espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
|
| 10 |
+
|
| 11 |
+
elif installed piper && installed aplay; then
|
| 12 |
+
cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
|
| 13 |
+
|
| 14 |
+
# for Mac
|
| 15 |
+
elif installed say; then
|
| 16 |
+
say -f $2
|
| 17 |
|
| 18 |
# Eleven Labs
|
| 19 |
+
elif installed python3 && \
|
| 20 |
+
python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
|
| 21 |
+
installed ffplay; then
|
| 22 |
+
# It's possible to use the API for free with limited number of characters.
|
| 23 |
+
# To increase this limit register to https://beta.elevenlabs.io to get an api key
|
| 24 |
+
# and paste it after 'ELEVEN_API_KEY='
|
| 25 |
+
# Keep the line commented to use the free version without api key
|
| 26 |
+
#export ELEVEN_API_KEY=your_api_key
|
| 27 |
+
wd=$(dirname $0)
|
| 28 |
+
script=$wd/eleven-labs.py
|
| 29 |
+
python3 $script -q -p -v $1 $2 >/dev/null 2>&1
|
| 30 |
+
|
| 31 |
+
# Uncomment to keep the audio file
|
| 32 |
+
#python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
|
| 33 |
+
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
|
| 34 |
+
|
| 35 |
+
else
|
| 36 |
+
echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
|
| 37 |
+
echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
|
| 38 |
+
echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
|
| 39 |
+
echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
|
| 40 |
+
fi
|
|
@@ -1,12 +1,14 @@
|
|
| 1 |
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
|
| 2 |
param(
|
| 3 |
-
|
| 4 |
-
[Parameter(Mandatory=$true)][string]$
|
| 5 |
-
[Parameter(Mandatory=$true)][string]$text
|
| 6 |
)
|
| 7 |
|
| 8 |
Add-Type -AssemblyName System.Speech;
|
| 9 |
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
|
| 10 |
-
$speak.
|
|
|
|
|
|
|
| 11 |
$speak.Rate="0";
|
|
|
|
| 12 |
$speak.Speak($text);
|
|
|
|
| 1 |
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
|
| 2 |
param(
|
| 3 |
+
[Parameter(Mandatory=$true)][int]$voicenum,
|
| 4 |
+
[Parameter(Mandatory=$true)][string]$textfile
|
|
|
|
| 5 |
)
|
| 6 |
|
| 7 |
Add-Type -AssemblyName System.Speech;
|
| 8 |
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
|
| 9 |
+
$voiceoptions = $speak.GetInstalledVoices("en-US");
|
| 10 |
+
$voice = $voiceoptions[$voicenum % $voiceoptions.count];
|
| 11 |
+
$speak.SelectVoice($voice.VoiceInfo.Name);
|
| 12 |
$speak.Rate="0";
|
| 13 |
+
$text = Get-Content -Path $textfile;
|
| 14 |
$speak.Speak($text);
|
|
@@ -38,6 +38,7 @@ struct whisper_params {
|
|
| 38 |
std::string model_wsp = "models/ggml-base.en.bin";
|
| 39 |
std::string model_gpt = "models/ggml-gpt-2-117M.bin";
|
| 40 |
std::string speak = "./examples/talk/speak";
|
|
|
|
| 41 |
std::string fname_out;
|
| 42 |
};
|
| 43 |
|
|
@@ -68,6 +69,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 68 |
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
|
| 69 |
else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
|
| 70 |
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
|
|
|
|
| 71 |
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
| 72 |
else {
|
| 73 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
|
@@ -102,6 +104,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 102 |
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
|
| 103 |
fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
|
| 104 |
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
|
|
|
|
| 105 |
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
| 106 |
fprintf(stderr, "\n");
|
| 107 |
}
|
|
@@ -316,7 +319,7 @@ int main(int argc, char ** argv) {
|
|
| 316 |
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
| 317 |
|
| 318 |
text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
|
| 319 |
-
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
| 320 |
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
|
| 321 |
|
| 322 |
// remove first 2 lines of base prompt
|
|
@@ -354,10 +357,7 @@ int main(int argc, char ** argv) {
|
|
| 354 |
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
|
| 355 |
|
| 356 |
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
|
| 357 |
-
|
| 358 |
-
if (ret != 0) {
|
| 359 |
-
fprintf(stderr, "%s: system() failed!\n", __func__);
|
| 360 |
-
}
|
| 361 |
|
| 362 |
audio.clear();
|
| 363 |
|
|
|
|
| 38 |
std::string model_wsp = "models/ggml-base.en.bin";
|
| 39 |
std::string model_gpt = "models/ggml-gpt-2-117M.bin";
|
| 40 |
std::string speak = "./examples/talk/speak";
|
| 41 |
+
std::string speak_file= "./examples/talk/to_speak.txt";
|
| 42 |
std::string fname_out;
|
| 43 |
};
|
| 44 |
|
|
|
|
| 69 |
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
|
| 70 |
else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
|
| 71 |
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
|
| 72 |
+
else if (arg == "-sf" || arg == "--speak_file") { params.speak_file = argv[++i]; }
|
| 73 |
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
| 74 |
else {
|
| 75 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
|
|
|
| 104 |
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
|
| 105 |
fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
|
| 106 |
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
|
| 107 |
+
fprintf(stderr, " -sf FILE, --speak_file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
|
| 108 |
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
| 109 |
fprintf(stderr, "\n");
|
| 110 |
}
|
|
|
|
| 319 |
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
| 320 |
|
| 321 |
text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
|
| 322 |
+
//text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
| 323 |
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
|
| 324 |
|
| 325 |
// remove first 2 lines of base prompt
|
|
|
|
| 357 |
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
|
| 358 |
|
| 359 |
text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
|
| 360 |
+
speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
audio.clear();
|
| 363 |
|