venkr committed on
Commit
adb49fb
·
unverified ·
1 Parent(s): 6958128

qual-bench.sh : add quality comparison tool, and update main.cpp to allow using a font file (#569)

Browse files
Files changed (2) hide show
  1. examples/main/main.cpp +11 -3
  2. extra/qual-bench.sh +67 -0
examples/main/main.cpp CHANGED
@@ -80,6 +80,7 @@ struct whisper_params {
80
 
81
  std::string language = "en";
82
  std::string prompt;
 
83
  std::string model = "models/ggml-base.en.bin";
84
 
85
  std::vector<std::string> fname_inp = {};
@@ -127,6 +128,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
127
  else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
128
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
129
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
 
130
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
131
  else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
132
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@@ -174,6 +176,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
174
  fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
175
  fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
176
  fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
 
177
  fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
178
  fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
179
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
@@ -368,13 +371,18 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
368
  // karaoke video generation
369
  // outputs a bash script that uses ffmpeg to generate a video with the subtitles
370
  // TODO: font parameter adjustments
371
- bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
372
  std::ofstream fout(fname);
373
 
374
  fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
375
 
376
- // TODO: become parameter
377
- static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
 
 
 
 
 
378
 
379
  fout << "#!/bin/bash" << "\n";
380
  fout << "\n";
 
80
 
81
  std::string language = "en";
82
  std::string prompt;
83
+ std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
84
  std::string model = "models/ggml-base.en.bin";
85
 
86
  std::vector<std::string> fname_inp = {};
 
128
  else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
129
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
130
  else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
131
+ else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
132
  else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
133
  else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
134
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
 
176
  fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
177
  fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
178
  fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
179
+ fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
180
  fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
181
  fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
182
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
 
371
  // karaoke video generation
372
  // outputs a bash script that uses ffmpeg to generate a video with the subtitles
373
  // TODO: font parameter adjustments
374
+ bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
375
  std::ofstream fout(fname);
376
 
377
  fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
378
 
379
+ static const char * font = params.font_path.c_str();
380
+
381
+ std::ifstream fin(font);
382
+ if (!fin.is_open()) {
383
+ fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
384
+ return false;
385
+ }
386
 
387
  fout << "#!/bin/bash" << "\n";
388
  fout << "\n";
extra/qual-bench.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#
# Quality / speed comparison of several whisper models on one audio file.
#
# For every model in `models` this script:
#   1. runs ./main with -owts to produce a karaoke script (<audio>.<model>.wts)
#      and measures the wall-clock time of that run,
#   2. runs the karaoke script and renames its output to <audio>.<model>.mp4,
#   3. renders a small caption strip (<audio>.<model>.info.mp4) with the model
#      name, the execution time and the realtime ratio.
# Finally all strips and videos are stacked vertically into <audio>.all.mp4.
#
# Usage: qual-bench.sh <audio file> [font file]
#   <audio file>  audio file to transcribe
#   [font file]   optional font; forwarded to ./main via -fp and used by the
#                 ffmpeg drawtext caption
#
# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu

if [ -z "$1" ]; then
    echo "Usage: $0 <audio file> [font file]"
    exit 1
fi

audio="$1"
font="$2"

#TODO: Make this a command line parameter
models=(base small large)
#models=(tiny.en tiny base.en base small.en small medium.en medium large-v1 large)

DURATION=$(ffprobe -i "${audio}" -show_entries format=duration -v quiet -of csv="p=0")
if [ -z "${DURATION}" ]; then
    echo "Error: could not determine the duration of '${audio}' (ffprobe failed)" >&2
    exit 1
fi
DURATION=$(printf "%.2f" "${DURATION}")
echo "Input file duration: ${DURATION}s"

for model in "${models[@]}"; do
    echo "Running ${model}"

    # Build the command as an array so paths containing spaces survive intact.
    cmd=(./main -m "models/ggml-${model}.bin" -owts -f "${audio}" -of "${audio}.${model}")
    if [ -n "${font}" ]; then
        cmd+=(-fp "${font}")
    fi

    # TIMEFMT is for zsh, TIMEFORMAT is for bash.
    # The group's exit status is that of the timed command, so a failing
    # ./main is reported instead of being silently ignored.
    if ! EXECTIME=$({ TIMEFMT="%E"; TIMEFORMAT=%E; time "${cmd[@]}" >/dev/null 2>&1; } 2>&1); then
        echo "Error: '${cmd[*]}' failed" >&2
        exit 1
    fi

    # Slightly different formats between zsh and bash: drop a trailing "s" if present
    EXECTIME=${EXECTIME%s}

    RATIO=$(echo "${DURATION} / ${EXECTIME}" | bc -l)
    RATIO=$(printf "%.2f" "${RATIO}")

    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"

    # Remove leftovers from a previous run so a stale video is never mistaken
    # for the output of this run.
    rm -f "${audio}.mp4" "${audio}.${model}.mp4"

    # The generated karaoke script writes its video to <audio>.mp4
    bash "${audio}.${model}.wts" >/dev/null 2>&1
    if ! mv "${audio}.mp4" "${audio}.${model}.mp4"; then
        echo "Error: no karaoke video was generated for '${model}' (check the font path and ffmpeg)" >&2
        exit 1
    fi

    # Caption strip; only pass fontfile when a font was actually supplied.
    drawtext="drawtext="
    if [ -n "${font}" ]; then
        drawtext="${drawtext}fontfile=${font}:"
    fi
    drawtext="${drawtext}fontsize=36:x=10:y=(h-text_h)/2:text='ggml-${model} - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey"

    ffmpeg -y -f lavfi -i "color=c=black:s=1200x50:d=${DURATION}" -vf "${drawtext}" "${audio}.${model}.info.mp4" >/dev/null 2>&1
done

# Stack every caption strip on top of its video, for all models, into one file.
cmd=(ffmpeg -y)
filter=""
COUNT=0
for model in "${models[@]}"; do
    cmd+=(-i "${audio}.${model}.info.mp4" -i "${audio}.${model}.mp4")
    filter="${filter}[${COUNT}:v][$((COUNT + 1)):v]"
    COUNT=$((COUNT + 2))
done
# Audio is taken from input 1, i.e. the first model's karaoke video.
cmd+=(-filter_complex "${filter} vstack=inputs=${COUNT}[v]" -map "[v]" -map 1:a "${audio}.all.mp4")

echo "${cmd[*]}"

# Run the command
"${cmd[@]}" >/dev/null 2>&1