Spaces:
Running
Running
venkr
committed on
qual-bench.sh : add quality comparison tool, and update main.cpp to allow using a font file (#569)
Browse files- examples/main/main.cpp +11 -3
- extra/qual-bench.sh +67 -0
examples/main/main.cpp
CHANGED
|
@@ -80,6 +80,7 @@ struct whisper_params {
|
|
| 80 |
|
| 81 |
std::string language = "en";
|
| 82 |
std::string prompt;
|
|
|
|
| 83 |
std::string model = "models/ggml-base.en.bin";
|
| 84 |
|
| 85 |
std::vector<std::string> fname_inp = {};
|
|
@@ -127,6 +128,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 127 |
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
| 128 |
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
| 129 |
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
|
|
|
| 130 |
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
| 131 |
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
| 132 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
@@ -174,6 +176,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 174 |
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
| 175 |
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
| 176 |
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
|
|
|
| 177 |
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
| 178 |
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
| 179 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
@@ -368,13 +371,18 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
|
|
| 368 |
// karaoke video generation
|
| 369 |
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
| 370 |
// TODO: font parameter adjustments
|
| 371 |
-
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params &
|
| 372 |
std::ofstream fout(fname);
|
| 373 |
|
| 374 |
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
| 375 |
|
| 376 |
-
|
| 377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
fout << "#!/bin/bash" << "\n";
|
| 380 |
fout << "\n";
|
|
|
|
| 80 |
|
| 81 |
std::string language = "en";
|
| 82 |
std::string prompt;
|
| 83 |
+
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
| 84 |
std::string model = "models/ggml-base.en.bin";
|
| 85 |
|
| 86 |
std::vector<std::string> fname_inp = {};
|
|
|
|
| 128 |
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
| 129 |
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
| 130 |
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
| 131 |
+
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
| 132 |
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
| 133 |
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
| 134 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
|
|
|
| 176 |
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
| 177 |
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
| 178 |
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
| 179 |
+
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
|
| 180 |
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
| 181 |
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
| 182 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
|
|
|
| 371 |
// karaoke video generation
|
| 372 |
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
| 373 |
// TODO: font parameter adjustments
|
| 374 |
+
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
|
| 375 |
std::ofstream fout(fname);
|
| 376 |
|
| 377 |
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
| 378 |
|
| 379 |
+
static const char * font = params.font_path.c_str();
|
| 380 |
+
|
| 381 |
+
std::ifstream fin(font);
|
| 382 |
+
if (!fin.is_open()) {
|
| 383 |
+
fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
|
| 384 |
+
return false;
|
| 385 |
+
}
|
| 386 |
|
| 387 |
fout << "#!/bin/bash" << "\n";
|
| 388 |
fout << "\n";
|
extra/qual-bench.sh
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This script takes two arguments
# - an audio file
# - [optional] path to a font file
#
# For every model listed in $models it:
#   1. transcribes the audio with ./main (-owts) and measures the wall time,
#   2. runs the generated .wts script to render the karaoke video,
#   3. renders a small banner video with the model name and timing.
# Finally all banners and videos are stacked vertically into
# "<audio file>.all.mp4".
#
# Needs bash or zsh (arrays are used), plus ffmpeg, ffprobe and bc.

# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu

if [ -z "$1" ]; then
    echo "Usage: $0 <audio file> [font file]"
    exit 1
fi

INPUT="$1"
FONT="$2"

#TODO: Make this a command line parameter
models=(base small large)
#models=(tiny.en tiny base.en base small.en small medium.en medium large-v1 large)

DURATION=$(ffprobe -i "$INPUT" -show_entries format=duration -v quiet -of csv="p=0")
if [ -z "$DURATION" ]; then
    echo "Could not determine the duration of '$INPUT'" >&2
    exit 1
fi
DURATION=$(printf "%.2f" "$DURATION")
echo "Input file duration: ${DURATION}s"

# Only hand drawtext a fontfile option when a font was supplied: an empty
# "fontfile=" value makes ffmpeg reject the filter. The single quotes keep
# spaces in the font path from being split inside the filter description.
FONTOPT=""
if [ -n "$FONT" ]; then
    FONTOPT="fontfile='$FONT':"
fi

for model in "${models[@]}"; do
    echo "Running $model"
    # Build the command as an array so that paths containing spaces survive
    COMMAND=(./main -m "models/ggml-$model.bin" -owts -f "$INPUT" -of "$INPUT.$model")

    if [ -n "$FONT" ]; then
        COMMAND+=(-fp "$FONT")
    fi

    # TIMEFMT is for zsh, TIMEFORMAT is for bash
    # The command's own output is discarded, so only the timing is captured.
    if ! EXECTIME=$({ TIMEFMT="%E";TIMEFORMAT=%E; time "${COMMAND[@]}" >/dev/null 2>&1; } 2>&1); then
        echo "Command failed: ${COMMAND[*]}" >&2
        exit 1
    fi

    # Slightly different formats between zsh and bash: drop a trailing "s"
    EXECTIME=${EXECTIME%s}

    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
    RATIO=$(printf "%.2f" "$RATIO")

    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"

    # If the per-model video already exists from a previous run, delete it
    rm -f "$INPUT.$model.mp4"

    # The generated .wts script writes its result to "$INPUT.mp4"
    bash "$INPUT.$model.wts" >/dev/null 2>&1
    if ! mv "$INPUT.mp4" "$INPUT.$model.mp4"; then
        echo "Karaoke video for model '$model' was not generated" >&2
        exit 1
    fi

    ffmpeg -y -f lavfi -i "color=c=black:s=1200x50:d=$DURATION" -vf "drawtext=${FONTOPT}fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey" "$INPUT.$model.info.mp4" >/dev/null 2>&1
done

# Stack every (banner, video) pair vertically; inputs are numbered in the
# order they are added, two per model.
COMMAND=(ffmpeg -y)
FILTER=""
COUNT=0
for model in "${models[@]}"; do
    COMMAND+=(-i "$INPUT.$model.info.mp4" -i "$INPUT.$model.mp4")
    FILTER="$FILTER[${COUNT}:v][$((COUNT+1)):v]"
    COUNT=$((COUNT+2))
done
# Audio is taken from input 1, i.e. the first model's karaoke video
COMMAND+=(-filter_complex "$FILTER vstack=inputs=${COUNT}[v]" -map "[v]" -map 1:a "$INPUT.all.mp4")

echo "${COMMAND[*]}"

# Run the command
"${COMMAND[@]}" >/dev/null 2>&1
|