Tamotsu Takahashi committed on
Commit
3fd8b4d
·
unverified ·
1 Parent(s): 9a168fc

talk, talk-llama : pass text_to_speak as a file (#1865)

Browse files

* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
-q, --quick skip checking the required library

action:
TEXTFILE read the text file (default: stdin)
-l, --list show the list of voices and exit
-h, --help show this help and exit

voice selection:
-n NAME, --name NAME get a voice object by name (default: Arnold)
-v NUMBER, --voice NUMBER
get a voice object by number (see --list)
-f KEY=VAL, --filter KEY=VAL
filter voices by labels (default: "use case=narration")
this option can be used multiple times
filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
-s FILE, --save FILE save the TTS to a file (default: audio.mp3)
-p, --play play the TTS with ffplay
```

* examples: add speak_with_file()

as suggested in the review

* talk and talk-llama: ignore to_speak.txt

examples/common.cpp CHANGED
@@ -863,3 +863,21 @@ bool is_file_exist(const char *fileName)
863
  std::ifstream infile(fileName);
864
  return infile.good();
865
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
  std::ifstream infile(fileName);
864
  return infile.good();
865
  }
866
+
867
// Write `text` to the file at `path`, then run `command voice_id path` through system().
//
// command  - TTS command (script or executable) to invoke
// text     - text to be spoken; written verbatim to `path`
// path     - file that receives the text and is passed to the command
// voice_id - voice number, passed to the command as its first argument
//
// Returns true only if the file was opened and fully written and the command
// returned 0. A diagnostic is printed to stderr on every failure path.
//
// NOTE: `command` and `path` are inserted into the command line unquoted, so a
// path containing spaces is split by the shell.
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
{
    std::ofstream speak_file(path.c_str());
    if (speak_file.fail()) {
        fprintf(stderr, "%s: failed to open speak_file\n", __func__);
        return false;
    }

    speak_file.write(text.c_str(), static_cast<std::streamsize>(text.size()));
    // close() flushes the buffer; a short write (e.g. disk full) shows up here
    speak_file.close();
    if (speak_file.fail()) {
        fprintf(stderr, "%s: failed to write speak_file\n", __func__);
        return false;
    }

    const std::string cmdline = command + " " + std::to_string(voice_id) + " " + path;
    const int ret = system(cmdline.c_str());
    if (ret != 0) {
        fprintf(stderr, "%s: failed to speak\n", __func__);
        return false;
    }

    return true;
}
examples/common.h CHANGED
@@ -306,3 +306,6 @@ int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
306
 
307
  // check if file exists using ifstream
308
  bool is_file_exist(const char *fileName);
 
 
 
 
306
 
307
  // check if file exists using ifstream
308
  bool is_file_exist(const char *fileName);
309
+
310
+ // write text to the file at path, then call system("command voice_id path"); returns false if the file cannot be opened or the command fails
311
+ bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
examples/talk-llama/.gitignore CHANGED
@@ -1 +1,2 @@
1
  audio.mp3
 
 
1
  audio.mp3
2
+ to_speak.txt
examples/talk-llama/eleven-labs.py CHANGED
@@ -1,20 +1,80 @@
1
  import sys
2
- import importlib.util
 
3
 
4
- if importlib.util.find_spec("elevenlabs") is None:
5
- print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
6
- sys.exit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- from elevenlabs import generate, play, save
 
 
 
 
9
 
10
- # Get a Voice object, by name or UUID
11
- voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Generate the TTS
14
  audio = generate(
15
- text=str(sys.argv[2:]),
16
- voice=voice
17
  )
18
-
19
- # Save the TTS to a file
20
- save(audio, "audio.mp3")
 
 
1
  import sys
2
+ import argparse
3
+ import textwrap
4
 
5
def build_parser():
    """Build the command-line parser for this script.

    ``-f`` deliberately has no argparse-level default: with ``action="append"``
    user-supplied values are appended to a list default instead of replacing
    it, so the documented default filter is applied in ``main`` instead.
    """
    parser = argparse.ArgumentParser(add_help=False,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-q", "--quick", action="store_true",
                        help="skip checking the required library")

    modes = parser.add_argument_group("action")
    modes.add_argument("inputfile", metavar="TEXTFILE",
                       nargs='?', type=argparse.FileType(), default=sys.stdin,
                       help="read the text file (default: stdin)")
    modes.add_argument("-l", "--list", action="store_true",
                       help="show the list of voices and exit")
    modes.add_argument("-h", "--help", action="help",
                       help="show this help and exit")

    selopts = parser.add_argument_group("voice selection")
    selmodes = selopts.add_mutually_exclusive_group()
    selmodes.add_argument("-n", "--name",
                          default="Arnold",
                          help="get a voice object by name (default: Arnold)")
    selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
                          help="get a voice object by number (see --list)")
    selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
                         default=None,
                         help=textwrap.dedent('''\
                             filter voices by labels (default: "use case=narration")
                             this option can be used multiple times
                             filtering will be disabled if the first -f has no "=" (e.g. -f "any")
                             '''))

    outmodes = parser.add_argument_group("output")
    outgroup = outmodes.add_mutually_exclusive_group()
    outgroup.add_argument("-s", "--save", metavar="FILE",
                          default="audio.mp3",
                          help="save the TTS to a file (default: audio.mp3)")
    outgroup.add_argument("-p", "--play", action="store_true",
                          help="play the TTS with ffplay")
    return parser


def filter_voices(voicelist, filters):
    """Return the voices whose labels match every KEY=VAL entry in ``filters``.

    Filtering is disabled (every voice is returned) when ``filters`` is empty
    or its first entry contains no "=". Each voice must expose a ``labels``
    mapping. Raises ValueError for a later entry that is not of the form
    KEY=VAL.
    """
    voicelist = list(voicelist)
    if not filters or "=" not in filters[0]:
        return voicelist
    for f in filters:
        if "=" not in f:
            raise ValueError("invalid filter (expected KEY=VAL): " + f)
        # split on the first "=" only, so that a value may itself contain "="
        label, value = f.split("=", 1)
        voicelist = [v for v in voicelist if v.labels.get(label) == value]
    return voicelist


def main():
    """Parse the command line, select a voice, and play or save the generated speech."""
    args = build_parser().parse_args()

    if not args.quick:
        import importlib.util
        if importlib.util.find_spec("elevenlabs") is None:
            print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
            # non-zero status so that callers can tell nothing was spoken
            sys.exit(1)

    from elevenlabs import voices, generate, play, save

    filters = args.filter if args.filter is not None else ["use case=narration"]
    voicelist = filter_voices(voices(), filters)

    if args.list:
        for i, v in enumerate(voicelist):
            print(str(i) + ": " + v.name + " " + str(v.labels))
        sys.exit()

    # compare against None: voice number 0 is a valid selection
    if args.voice is not None:
        if not voicelist:
            print("no voice matches the given filters", file=sys.stderr)
            sys.exit(1)
        # wrap around so that any number maps to a listed voice
        voice = voicelist[args.voice % len(voicelist)]
    else:
        voice = args.name
        # if -n should consult -f, use the following
        #voice = next(x for x in voicelist if x.name == args.name)

    audio = generate(
        text=args.inputfile.read(),
        voice=voice
    )
    if args.play:
        play(audio)
    else:
        save(audio, args.save)


if __name__ == "__main__":
    main()
examples/talk-llama/speak CHANGED
@@ -1,32 +1,40 @@
1
  #!/bin/bash
2
 
3
  # Usage:
4
- # speak.sh <voice_id> <text-to-speak>
5
-
6
- # espeak
7
- # Mac OS: brew install espeak
8
- # Linux: apt-get install espeak
9
- #
10
- #espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
11
-
12
- # piper
13
- #
14
- # https://github.com/rhasspy/piper
15
- #
16
- # Tested with Linux:
17
- #
18
- #echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
19
 
20
  # for Mac
21
- say "$2"
 
22
 
23
  # Eleven Labs
24
- # To use it, install the elevenlabs module from pip (pip install elevenlabs)
25
- # It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
26
- #Keep the line commented to use the free version whitout api key
27
- #
28
- #export ELEVEN_API_KEY=your_api_key
29
- #wd=$(dirname $0)
30
- #script=$wd/eleven-labs.py
31
- #python3 $script $1 "$2" >/dev/null 2>&1
32
- #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/bin/bash
2
 
3
  # Usage:
4
+ # speak <voice_id> <textfile>
5
+
6
# Return success if the given command is available.
function installed() { command -v "$1" >/dev/null 2>&1; }

if [ "$#" -lt 2 ]; then
  echo "usage: $0 <voice_id> <textfile>" >&2
  exit 1
fi

# Arguments are always quoted below so that paths with spaces survive.
voice_id=$1
textfile=$2

if installed espeak; then
  espeak -v "en-us+m$voice_id" -s 225 -p 50 -a 200 -g 5 -k 5 -f "$textfile"

elif installed piper && installed aplay; then
  piper --model ~/en_US-lessac-medium.onnx --output-raw < "$textfile" | aplay -q -r 22050 -f S16_LE -t raw -

# for Mac
elif installed say; then
  say -f "$textfile"

# Eleven Labs
elif installed python3 && \
     python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
     installed ffplay; then
  # It's possible to use the API for free with limited number of characters.
  # To increase this limit register to https://beta.elevenlabs.io to get an api key
  # and paste it after 'ELEVEN_API_KEY='
  # Keep the line commented to use the free version without api key
  #export ELEVEN_API_KEY=your_api_key
  wd=$(dirname "$0")
  script=$wd/eleven-labs.py
  python3 "$script" -q -p -v "$voice_id" "$textfile" >/dev/null 2>&1

  # Uncomment to keep the audio file
  #python3 "$script" -q -s ./audio.mp3 -v "$voice_id" "$textfile" >/dev/null 2>&1
  #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1

else
  # No TTS backend found: report on stderr and fail so the caller notices.
  echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),' >&2
  echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,' >&2
  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.' >&2
  echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)' >&2
  exit 1
fi
examples/talk-llama/speak.bat CHANGED
@@ -1 +1 @@
1
- @powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2
 
1
+ @powershell -ExecutionPolicy Bypass -F examples\talk-llama\speak.ps1 %1 %2
examples/talk-llama/speak.ps1 CHANGED
@@ -1,12 +1,14 @@
1
  # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
2
  param(
3
- # voice options are David or Zira
4
- [Parameter(Mandatory=$true)][string]$voice,
5
- [Parameter(Mandatory=$true)][string]$text
6
  )
7
 
8
  Add-Type -AssemblyName System.Speech;
9
  $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
10
- $speak.SelectVoice("Microsoft $voice Desktop");
 
 
11
  $speak.Rate="0";
 
12
  $speak.Speak($text);
 
1
  # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
2
  param(
3
+ [Parameter(Mandatory=$true)][int]$voicenum,
4
+ [Parameter(Mandatory=$true)][string]$textfile
 
5
  )
6
 
7
  Add-Type -AssemblyName System.Speech;
8
  $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
9
+ $voiceoptions = $speak.GetInstalledVoices("en-US");
10
+ $voice = $voiceoptions[$voicenum % $voiceoptions.count];
11
+ $speak.SelectVoice($voice.VoiceInfo.Name);
12
  $speak.Rate="0";
13
+ $text = Get-Content -Path $textfile;
14
  $speak.Speak($text);
examples/talk-llama/talk-llama.cpp CHANGED
@@ -75,6 +75,7 @@ struct whisper_params {
75
  std::string model_wsp = "models/ggml-base.en.bin";
76
  std::string model_llama = "models/ggml-llama-7B.bin";
77
  std::string speak = "./examples/talk-llama/speak";
 
78
  std::string prompt = "";
79
  std::string fname_out;
80
  std::string path_session = ""; // path to file for saving/loading model eval state
@@ -113,6 +114,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
113
  else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
114
  else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
115
  else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
 
116
  else if (arg == "--prompt-file") {
117
  std::ifstream file(argv[++i]);
118
  std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
@@ -160,6 +162,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
160
  fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
161
  fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
162
  fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
 
163
  fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
164
  fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
165
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
@@ -546,10 +549,7 @@ int main(int argc, char ** argv) {
546
 
547
  // optionally give audio feedback that the current text is being processed
548
  if (!params.heard_ok.empty()) {
549
- int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + params.heard_ok + "'").c_str());
550
- if (ret != 0) {
551
- fprintf(stderr, "%s: failed to speak\n", __func__);
552
- }
553
  }
554
 
555
  // remove text between brackets using regex
@@ -748,11 +748,7 @@ int main(int argc, char ** argv) {
748
  }
749
  }
750
 
751
- text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
752
- int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
753
- if (ret != 0) {
754
- fprintf(stderr, "%s: failed to speak\n", __func__);
755
- }
756
 
757
  audio.clear();
758
  }
 
75
  std::string model_wsp = "models/ggml-base.en.bin";
76
  std::string model_llama = "models/ggml-llama-7B.bin";
77
  std::string speak = "./examples/talk-llama/speak";
78
+ std::string speak_file = "./examples/talk-llama/to_speak.txt";
79
  std::string prompt = "";
80
  std::string fname_out;
81
  std::string path_session = ""; // path to file for saving/loading model eval state
 
114
  else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
115
  else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
116
  else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
117
+ else if (arg == "-sf" || arg == "--speak-file") { params.speak_file = argv[++i]; }
118
  else if (arg == "--prompt-file") {
119
  std::ifstream file(argv[++i]);
120
  std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
 
162
  fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
163
  fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
164
  fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
165
+ fprintf(stderr, " -sf FILE, --speak-file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
166
  fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
167
  fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
168
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
 
549
 
550
  // optionally give audio feedback that the current text is being processed
551
  if (!params.heard_ok.empty()) {
552
+ speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id);
 
 
 
553
  }
554
 
555
  // remove text between brackets using regex
 
748
  }
749
  }
750
 
751
+ speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
 
 
 
 
752
 
753
  audio.clear();
754
  }
examples/talk/.gitignore CHANGED
@@ -1 +1,2 @@
1
  audio.mp3
 
 
1
  audio.mp3
2
+ to_speak.txt
examples/talk/eleven-labs.py CHANGED
@@ -1,20 +1,80 @@
1
  import sys
2
- import importlib.util
 
3
 
4
- if importlib.util.find_spec("elevenlabs") is None:
5
- print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
6
- sys.exit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- from elevenlabs import generate, play, save
 
 
 
 
9
 
10
- # Get a Voice object, by name or UUID
11
- voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Generate the TTS
14
  audio = generate(
15
- text=str(sys.argv[2:]),
16
- voice=voice
17
  )
18
-
19
- # Save the TTS to a file
20
- save(audio, "audio.mp3")
 
 
1
  import sys
2
+ import argparse
3
+ import textwrap
4
 
5
def build_parser():
    """Build the command-line parser for this script.

    ``-f`` deliberately has no argparse-level default: with ``action="append"``
    user-supplied values are appended to a list default instead of replacing
    it, so the documented default filter is applied in ``main`` instead.
    """
    parser = argparse.ArgumentParser(add_help=False,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-q", "--quick", action="store_true",
                        help="skip checking the required library")

    modes = parser.add_argument_group("action")
    modes.add_argument("inputfile", metavar="TEXTFILE",
                       nargs='?', type=argparse.FileType(), default=sys.stdin,
                       help="read the text file (default: stdin)")
    modes.add_argument("-l", "--list", action="store_true",
                       help="show the list of voices and exit")
    modes.add_argument("-h", "--help", action="help",
                       help="show this help and exit")

    selopts = parser.add_argument_group("voice selection")
    selmodes = selopts.add_mutually_exclusive_group()
    selmodes.add_argument("-n", "--name",
                          default="Arnold",
                          help="get a voice object by name (default: Arnold)")
    selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
                          help="get a voice object by number (see --list)")
    selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
                         default=None,
                         help=textwrap.dedent('''\
                             filter voices by labels (default: "use case=narration")
                             this option can be used multiple times
                             filtering will be disabled if the first -f has no "=" (e.g. -f "any")
                             '''))

    outmodes = parser.add_argument_group("output")
    outgroup = outmodes.add_mutually_exclusive_group()
    outgroup.add_argument("-s", "--save", metavar="FILE",
                          default="audio.mp3",
                          help="save the TTS to a file (default: audio.mp3)")
    outgroup.add_argument("-p", "--play", action="store_true",
                          help="play the TTS with ffplay")
    return parser


def filter_voices(voicelist, filters):
    """Return the voices whose labels match every KEY=VAL entry in ``filters``.

    Filtering is disabled (every voice is returned) when ``filters`` is empty
    or its first entry contains no "=". Each voice must expose a ``labels``
    mapping. Raises ValueError for a later entry that is not of the form
    KEY=VAL.
    """
    voicelist = list(voicelist)
    if not filters or "=" not in filters[0]:
        return voicelist
    for f in filters:
        if "=" not in f:
            raise ValueError("invalid filter (expected KEY=VAL): " + f)
        # split on the first "=" only, so that a value may itself contain "="
        label, value = f.split("=", 1)
        voicelist = [v for v in voicelist if v.labels.get(label) == value]
    return voicelist


def main():
    """Parse the command line, select a voice, and play or save the generated speech."""
    args = build_parser().parse_args()

    if not args.quick:
        import importlib.util
        if importlib.util.find_spec("elevenlabs") is None:
            print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
            # non-zero status so that callers can tell nothing was spoken
            sys.exit(1)

    from elevenlabs import voices, generate, play, save

    filters = args.filter if args.filter is not None else ["use case=narration"]
    voicelist = filter_voices(voices(), filters)

    if args.list:
        for i, v in enumerate(voicelist):
            print(str(i) + ": " + v.name + " " + str(v.labels))
        sys.exit()

    # compare against None: voice number 0 is a valid selection
    if args.voice is not None:
        if not voicelist:
            print("no voice matches the given filters", file=sys.stderr)
            sys.exit(1)
        # wrap around so that any number maps to a listed voice
        voice = voicelist[args.voice % len(voicelist)]
    else:
        voice = args.name
        # if -n should consult -f, use the following
        #voice = next(x for x in voicelist if x.name == args.name)

    audio = generate(
        text=args.inputfile.read(),
        voice=voice
    )
    if args.play:
        play(audio)
    else:
        save(audio, args.save)


if __name__ == "__main__":
    main()
examples/talk/speak CHANGED
@@ -1,24 +1,40 @@
1
  #!/bin/bash
2
 
3
  # Usage:
4
- # speak.sh <voice_id> <text-to-speak>
5
 
6
- # espeak
7
- # Mac OS: brew install espeak
8
- # Linux: apt-get install espeak
9
- #
10
- #espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
11
 
12
- # Mac OS "say" command
13
- say "$2"
 
 
 
 
 
 
 
14
 
15
  # Eleven Labs
16
- # To use it, install the elevenlabs module from pip (pip install elevenlabs)
17
- # It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
18
- #Keep the line commented to use the free version without api key
19
- #
20
- #export ELEVEN_API_KEY=your_api_key
21
- #wd=$(dirname $0)
22
- #script=$wd/eleven-labs.py
23
- #python3 $script $1 "$2"
24
- #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  #!/bin/bash
2
 
3
  # Usage:
4
+ # speak <voice_id> <textfile>
5
 
6
# Return success if the given command is available.
function installed() { command -v "$1" >/dev/null 2>&1; }

if [ "$#" -lt 2 ]; then
  echo "usage: $0 <voice_id> <textfile>" >&2
  exit 1
fi

# Arguments are always quoted below so that paths with spaces survive.
voice_id=$1
textfile=$2

if installed espeak; then
  espeak -v "en-us+m$voice_id" -s 225 -p 50 -a 200 -g 5 -k 5 -f "$textfile"

elif installed piper && installed aplay; then
  piper --model ~/en_US-lessac-medium.onnx --output-raw < "$textfile" | aplay -q -r 22050 -f S16_LE -t raw -

# for Mac
elif installed say; then
  say -f "$textfile"

# Eleven Labs
elif installed python3 && \
     python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
     installed ffplay; then
  # It's possible to use the API for free with limited number of characters.
  # To increase this limit register to https://beta.elevenlabs.io to get an api key
  # and paste it after 'ELEVEN_API_KEY='
  # Keep the line commented to use the free version without api key
  #export ELEVEN_API_KEY=your_api_key
  wd=$(dirname "$0")
  script=$wd/eleven-labs.py
  python3 "$script" -q -p -v "$voice_id" "$textfile" >/dev/null 2>&1

  # Uncomment to keep the audio file
  #python3 "$script" -q -s ./audio.mp3 -v "$voice_id" "$textfile" >/dev/null 2>&1
  #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1

else
  # No TTS backend found: report on stderr and fail so the caller notices.
  echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),' >&2
  echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,' >&2
  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.' >&2
  echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)' >&2
  exit 1
fi
examples/talk/speak.ps1 CHANGED
@@ -1,12 +1,14 @@
1
  # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
2
  param(
3
- # voice options are David or Zira
4
- [Parameter(Mandatory=$true)][string]$voice,
5
- [Parameter(Mandatory=$true)][string]$text
6
  )
7
 
8
  Add-Type -AssemblyName System.Speech;
9
  $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
10
- $speak.SelectVoice("Microsoft $voice Desktop");
 
 
11
  $speak.Rate="0";
 
12
  $speak.Speak($text);
 
1
  # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
2
  param(
3
+ [Parameter(Mandatory=$true)][int]$voicenum,
4
+ [Parameter(Mandatory=$true)][string]$textfile
 
5
  )
6
 
7
  Add-Type -AssemblyName System.Speech;
8
  $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
9
+ $voiceoptions = $speak.GetInstalledVoices("en-US");
10
+ $voice = $voiceoptions[$voicenum % $voiceoptions.count];
11
+ $speak.SelectVoice($voice.VoiceInfo.Name);
12
  $speak.Rate="0";
13
+ $text = Get-Content -Path $textfile;
14
  $speak.Speak($text);
examples/talk/talk.cpp CHANGED
@@ -38,6 +38,7 @@ struct whisper_params {
38
  std::string model_wsp = "models/ggml-base.en.bin";
39
  std::string model_gpt = "models/ggml-gpt-2-117M.bin";
40
  std::string speak = "./examples/talk/speak";
 
41
  std::string fname_out;
42
  };
43
 
@@ -68,6 +69,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
68
  else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
69
  else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
70
  else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
 
71
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
72
  else {
73
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -102,6 +104,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
102
  fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
103
  fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
104
  fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
 
105
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
106
  fprintf(stderr, "\n");
107
  }
@@ -316,7 +319,7 @@ int main(int argc, char ** argv) {
316
  std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
317
 
318
  text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
319
- text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
320
  text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
321
 
322
  // remove first 2 lines of base prompt
@@ -354,10 +357,7 @@ int main(int argc, char ** argv) {
354
  gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
355
 
356
  text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
357
- int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
358
- if (ret != 0) {
359
- fprintf(stderr, "%s: system() failed!\n", __func__);
360
- }
361
 
362
  audio.clear();
363
 
 
38
  std::string model_wsp = "models/ggml-base.en.bin";
39
  std::string model_gpt = "models/ggml-gpt-2-117M.bin";
40
  std::string speak = "./examples/talk/speak";
41
+ std::string speak_file= "./examples/talk/to_speak.txt";
42
  std::string fname_out;
43
  };
44
 
 
69
  else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
70
  else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
71
  else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
72
+ else if (arg == "-sf" || arg == "--speak_file") { params.speak_file = argv[++i]; }
73
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
74
  else {
75
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
 
104
  fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
105
  fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
106
  fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
107
+ fprintf(stderr, " -sf FILE, --speak_file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
108
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
109
  fprintf(stderr, "\n");
110
  }
 
319
  std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
320
 
321
  text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
322
+ //text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
323
  text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
324
 
325
  // remove first 2 lines of base prompt
 
357
  gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
358
 
359
  text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
360
+ speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
 
 
 
361
 
362
  audio.clear();
363