Spaces:
Running
Running
add sync task
Browse files- Dockerfile +28 -0
- examples/server/server.cpp +470 -272
Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM ubuntu:22.04 AS build
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
|
| 4 |
+
RUN apt-get update && \
|
| 5 |
+
apt-get install -y build-essential wget cmake git \
|
| 6 |
+
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
| 7 |
+
|
| 8 |
+
COPY .. .
|
| 9 |
+
RUN make base.en
|
| 10 |
+
|
| 11 |
+
FROM ubuntu:22.04 AS runtime
|
| 12 |
+
WORKDIR /app
|
| 13 |
+
|
| 14 |
+
RUN apt-get update && \
|
| 15 |
+
apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
|
| 16 |
+
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
|
| 17 |
+
|
| 18 |
+
COPY --from=build /app /app
|
| 19 |
+
ENV PATH=/app/build/bin:$PATH
|
| 20 |
+
# ENTRYPOINT [ "bash", "-c" ]
|
| 21 |
+
|
| 22 |
+
RUN mkdir /models
|
| 23 |
+
RUN ./models/download-ggml-model.sh large-v3 /models
|
| 24 |
+
RUN ./models/download-vad-model.sh silero-v5.1.2 /models
|
| 25 |
+
|
| 26 |
+
EXPOSE 7860
|
| 27 |
+
|
| 28 |
+
# Exec-form CMD: every argument must be its own array element. With no
# ENTRYPOINT active (the "bash -c" one above is commented out), a single
# string would be treated as the executable name and the container would
# fail to start.
CMD [ "whisper-server", "--host", "0.0.0.0", "--port", "7860", "--vad", "-vm", "/models/ggml-silero-v5.1.2.bin", "-m", "/models/ggml-large-v3.bin" ]
|
examples/server/server.cpp
CHANGED
|
@@ -19,6 +19,7 @@
|
|
| 19 |
#include <atomic>
|
| 20 |
#include <functional>
|
| 21 |
#include <cstdlib>
|
|
|
|
| 22 |
#if defined (_WIN32)
|
| 23 |
#include <windows.h>
|
| 24 |
#endif
|
|
@@ -610,6 +611,31 @@ void get_req_parameters(const Request & req, whisper_params & params)
|
|
| 610 |
|
| 611 |
} // namespace
|
| 612 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
int main(int argc, char ** argv) {
|
| 614 |
ggml_backend_load_all();
|
| 615 |
|
|
@@ -719,27 +745,29 @@ int main(int argc, char ** argv) {
|
|
| 719 |
<style>
|
| 720 |
body {
|
| 721 |
font-family: sans-serif;
|
|
|
|
|
|
|
|
|
|
| 722 |
}
|
| 723 |
form {
|
| 724 |
display: flex;
|
| 725 |
flex-direction: column;
|
| 726 |
align-items: flex-start;
|
|
|
|
| 727 |
}
|
| 728 |
label {
|
| 729 |
margin-bottom: 0.5rem;
|
| 730 |
}
|
| 731 |
-
input, select {
|
| 732 |
margin-bottom: 1rem;
|
| 733 |
}
|
| 734 |
-
|
| 735 |
-
margin-top: 1rem;
|
| 736 |
-
}
|
| 737 |
</style>
|
| 738 |
</head>
|
| 739 |
<body>
|
| 740 |
<h1>Whisper.cpp Server</h1>
|
| 741 |
|
| 742 |
-
<h2>/inference</h2>
|
| 743 |
<pre>
|
| 744 |
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
|
| 745 |
-H "Content-Type: multipart/form-data" \
|
|
@@ -756,14 +784,11 @@ int main(int argc, char ** argv) {
|
|
| 756 |
-F model="<path-to-model-file>"
|
| 757 |
</pre>
|
| 758 |
|
| 759 |
-
<div>
|
| 760 |
-
<h2
|
| 761 |
<form action="/inference" method="POST" enctype="multipart/form-data">
|
| 762 |
<label for="file">Choose an audio file:</label>
|
| 763 |
-
<input type="file" id="file" name="file" accept="audio/*" required
|
| 764 |
-
|
| 765 |
-
<label for="temperature">Temperature:</label>
|
| 766 |
-
<input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
|
| 767 |
|
| 768 |
<label for="response_format">Response Format:</label>
|
| 769 |
<select id="response_format" name="response_format">
|
|
@@ -772,11 +797,120 @@ int main(int argc, char ** argv) {
|
|
| 772 |
<option value="text">Text</option>
|
| 773 |
<option value="srt">SRT</option>
|
| 774 |
<option value="vtt">VTT</option>
|
| 775 |
-
</select
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
|
| 777 |
-
<button
|
| 778 |
</form>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 779 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 780 |
</body>
|
| 781 |
</html>
|
| 782 |
)";
|
|
@@ -793,323 +927,387 @@ int main(int argc, char ** argv) {
|
|
| 793 |
svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
|
| 794 |
});
|
| 795 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 796 |
svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
|
| 797 |
-
//
|
| 798 |
std::lock_guard<std::mutex> lock(whisper_mutex);
|
| 799 |
|
| 800 |
-
|
| 801 |
-
if (!req.has_file("file"))
|
| 802 |
-
{
|
| 803 |
-
fprintf(stderr, "error: no 'file' field in the request\n");
|
| 804 |
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
|
| 805 |
res.set_content(error_resp, "application/json");
|
| 806 |
return;
|
| 807 |
}
|
| 808 |
auto audio_file = req.get_file_value("file");
|
| 809 |
-
|
| 810 |
-
// check non-required fields
|
| 811 |
get_req_parameters(req, params);
|
| 812 |
|
| 813 |
-
std::
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
// audio arrays
|
| 817 |
-
std::vector<float> pcmf32; // mono-channel F32 PCM
|
| 818 |
-
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
| 819 |
-
|
| 820 |
if (sparams.ffmpeg_converter) {
|
| 821 |
-
// if file is not wav, convert to wav
|
| 822 |
-
// write to temporary file
|
| 823 |
const std::string temp_filename = generate_temp_filename("whisper-server", ".wav");
|
| 824 |
std::ofstream temp_file{temp_filename, std::ios::binary};
|
| 825 |
temp_file << audio_file.content;
|
| 826 |
temp_file.close();
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
const bool is_converted = convert_to_wav(temp_filename, error_resp);
|
| 830 |
-
if (!is_converted) {
|
| 831 |
res.set_content(error_resp, "application/json");
|
| 832 |
return;
|
| 833 |
}
|
| 834 |
-
|
| 835 |
-
// read audio content into pcmf32
|
| 836 |
-
if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, params.diarize))
|
| 837 |
-
{
|
| 838 |
-
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
|
| 839 |
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
|
| 840 |
res.set_content(error_resp, "application/json");
|
| 841 |
std::remove(temp_filename.c_str());
|
| 842 |
return;
|
| 843 |
}
|
| 844 |
-
// remove temp file
|
| 845 |
std::remove(temp_filename.c_str());
|
| 846 |
} else {
|
| 847 |
-
if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
|
| 848 |
-
{
|
| 849 |
-
fprintf(stderr, "error: failed to read audio data\n");
|
| 850 |
const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
|
| 851 |
res.set_content(error_resp, "application/json");
|
| 852 |
return;
|
| 853 |
}
|
| 854 |
}
|
| 855 |
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
// print system information
|
| 859 |
{
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
}
|
| 864 |
|
| 865 |
-
//
|
|
|
|
|
|
|
|
|
|
| 866 |
{
|
| 867 |
-
|
| 868 |
-
if (
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
|
|
|
| 874 |
}
|
| 875 |
-
|
| 876 |
-
params.language = "auto";
|
| 877 |
-
}
|
| 878 |
-
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
|
| 879 |
-
__func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
| 880 |
-
params.n_threads, params.n_processors,
|
| 881 |
-
params.language.c_str(),
|
| 882 |
-
params.translate ? "translate" : "transcribe",
|
| 883 |
-
params.tinydiarize ? "tdrz = 1, " : "",
|
| 884 |
-
params.no_timestamps ? 0 : 1);
|
| 885 |
-
|
| 886 |
-
fprintf(stderr, "\n");
|
| 887 |
}
|
|
|
|
| 888 |
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
wparams.print_realtime = false;
|
| 897 |
-
wparams.print_progress = params.print_progress;
|
| 898 |
-
wparams.print_timestamps = !params.no_timestamps;
|
| 899 |
-
wparams.print_special = params.print_special;
|
| 900 |
-
wparams.translate = params.translate;
|
| 901 |
-
wparams.language = params.language.c_str();
|
| 902 |
-
wparams.detect_language = params.detect_language;
|
| 903 |
-
wparams.n_threads = params.n_threads;
|
| 904 |
-
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
| 905 |
-
wparams.offset_ms = params.offset_t_ms;
|
| 906 |
-
wparams.duration_ms = params.duration_ms;
|
| 907 |
-
|
| 908 |
-
wparams.thold_pt = params.word_thold;
|
| 909 |
-
wparams.max_len = params.max_len == 0 ? 60 : params.max_len;
|
| 910 |
-
wparams.split_on_word = params.split_on_word;
|
| 911 |
-
wparams.audio_ctx = params.audio_ctx;
|
| 912 |
-
|
| 913 |
-
wparams.debug_mode = params.debug_mode;
|
| 914 |
-
|
| 915 |
-
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
| 916 |
-
|
| 917 |
-
wparams.initial_prompt = params.prompt.c_str();
|
| 918 |
-
|
| 919 |
-
wparams.greedy.best_of = params.best_of;
|
| 920 |
-
wparams.beam_search.beam_size = params.beam_size;
|
| 921 |
-
|
| 922 |
-
wparams.temperature = params.temperature;
|
| 923 |
-
wparams.no_speech_thold = params.no_speech_thold;
|
| 924 |
-
wparams.temperature_inc = params.temperature_inc;
|
| 925 |
-
wparams.entropy_thold = params.entropy_thold;
|
| 926 |
-
wparams.logprob_thold = params.logprob_thold;
|
| 927 |
-
|
| 928 |
-
wparams.no_timestamps = params.no_timestamps;
|
| 929 |
-
wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
|
| 930 |
-
wparams.no_context = params.no_context;
|
| 931 |
-
|
| 932 |
-
wparams.suppress_nst = params.suppress_nst;
|
| 933 |
|
| 934 |
-
|
| 935 |
-
|
|
|
|
| 936 |
|
| 937 |
-
|
| 938 |
-
wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
|
| 939 |
-
wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
|
| 940 |
-
wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
|
| 941 |
-
wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
|
| 942 |
-
wparams.vad_params.samples_overlap = params.vad_samples_overlap;
|
| 943 |
|
| 944 |
-
|
|
|
|
| 945 |
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 950 |
}
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
|
|
|
| 955 |
}
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
auto req_ptr = static_cast<const httplib::Request*>(user_data);
|
| 961 |
-
return req_ptr->is_connection_closed();
|
| 962 |
-
};
|
| 963 |
-
wparams.abort_callback_user_data = (void*)&req;
|
| 964 |
-
|
| 965 |
-
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
| 966 |
-
// handle failure or early abort
|
| 967 |
-
if (req.is_connection_closed()) {
|
| 968 |
-
// log client disconnect
|
| 969 |
-
fprintf(stderr, "client disconnected, aborted processing\n");
|
| 970 |
-
res.status = 499; // Client Closed Request (nginx convention)
|
| 971 |
-
res.set_content("{\"error\":\"client disconnected\"}", "application/json");
|
| 972 |
-
return;
|
| 973 |
-
}
|
| 974 |
-
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
| 975 |
-
res.status = 500; // Internal Server Error
|
| 976 |
-
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
|
| 977 |
res.set_content(error_resp, "application/json");
|
| 978 |
return;
|
| 979 |
}
|
| 980 |
}
|
| 981 |
|
| 982 |
-
//
|
| 983 |
-
|
| 984 |
{
|
| 985 |
-
std::
|
| 986 |
-
|
|
|
|
| 987 |
}
|
| 988 |
-
else if (params.response_format == srt_format)
|
| 989 |
-
{
|
| 990 |
-
std::stringstream ss;
|
| 991 |
-
const int n_segments = whisper_full_n_segments(ctx);
|
| 992 |
-
for (int i = 0; i < n_segments; ++i) {
|
| 993 |
-
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 994 |
-
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 995 |
-
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 996 |
-
std::string speaker = "";
|
| 997 |
-
|
| 998 |
-
if (params.diarize && pcmf32s.size() == 2)
|
| 999 |
-
{
|
| 1000 |
-
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
|
| 1001 |
-
}
|
| 1002 |
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 1016 |
-
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 1017 |
-
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 1018 |
-
std::string speaker = "";
|
| 1019 |
-
|
| 1020 |
-
if (params.diarize && pcmf32s.size() == 2)
|
| 1021 |
-
{
|
| 1022 |
-
speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
|
| 1023 |
-
speaker.insert(0, "<v Speaker");
|
| 1024 |
-
speaker.append(">");
|
| 1025 |
-
}
|
| 1026 |
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
res.set_content(
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
|
| 1045 |
-
jres["detected_language"] = whisper_lang_str_full(detected_lang_id);
|
| 1046 |
-
jres["detected_language_probability"] = lang_probs[detected_lang_id];
|
| 1047 |
-
jres["language_probabilities"] = json::object();
|
| 1048 |
-
// Add all language probabilities
|
| 1049 |
-
for (int i = 0; i <= whisper_lang_max_id(); ++i) {
|
| 1050 |
-
if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
|
| 1051 |
-
jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
|
| 1052 |
-
}
|
| 1053 |
-
}
|
| 1054 |
}
|
| 1055 |
-
const int n_segments = whisper_full_n_segments(ctx);
|
| 1056 |
-
for (int i = 0; i < n_segments; ++i)
|
| 1057 |
-
{
|
| 1058 |
-
json segment = json{
|
| 1059 |
-
{"id", i},
|
| 1060 |
-
{"text", whisper_full_get_segment_text(ctx, i)},
|
| 1061 |
-
};
|
| 1062 |
-
|
| 1063 |
-
if (!params.no_timestamps) {
|
| 1064 |
-
segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
|
| 1065 |
-
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
|
| 1066 |
-
}
|
| 1067 |
-
|
| 1068 |
-
float total_logprob = 0;
|
| 1069 |
-
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
| 1070 |
-
for (int j = 0; j < n_tokens; ++j) {
|
| 1071 |
-
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
| 1072 |
-
if (token.id >= whisper_token_eot(ctx)) {
|
| 1073 |
-
continue;
|
| 1074 |
-
}
|
| 1075 |
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
segment["no_speech_prob"] = whisper_full_get_segment_no_speech_prob(ctx, i);
|
| 1094 |
-
|
| 1095 |
-
jres["segments"].push_back(segment);
|
| 1096 |
}
|
| 1097 |
-
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
|
| 1098 |
-
"application/json");
|
| 1099 |
-
}
|
| 1100 |
-
// TODO add more output formats
|
| 1101 |
-
else
|
| 1102 |
-
{
|
| 1103 |
-
std::string results = output_str(ctx, params, pcmf32s);
|
| 1104 |
-
json jres = json{
|
| 1105 |
-
{"text", results}
|
| 1106 |
-
};
|
| 1107 |
-
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
|
| 1108 |
-
"application/json");
|
| 1109 |
}
|
| 1110 |
|
| 1111 |
-
|
| 1112 |
-
params = default_params;
|
| 1113 |
});
|
| 1114 |
svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
|
| 1115 |
std::lock_guard<std::mutex> lock(whisper_mutex);
|
|
|
|
| 19 |
#include <atomic>
|
| 20 |
#include <functional>
|
| 21 |
#include <cstdlib>
|
| 22 |
+
#include <unordered_map>
|
| 23 |
#if defined (_WIN32)
|
| 24 |
#include <windows.h>
|
| 25 |
#endif
|
|
|
|
| 611 |
|
| 612 |
} // namespace
|
| 613 |
|
| 614 |
+
// Async task management
|
| 615 |
+
// File-local state and helpers for asynchronous inference tasks.
namespace {
|
| 616 |
+
// Lifecycle states of an asynchronous task. A default-constructed
// async_task_t starts in PENDING.
enum class async_status { PENDING, RUNNING, FINISHED, FAILED };
|
| 617 |
+
|
| 618 |
+
// Per-task record: current status plus the response payload or error text.
struct async_task_t {
|
| 619 |
+
async_status status = async_status::PENDING;
|
| 620 |
+
std::string result; // final response body
|
| 621 |
+
// NOTE(review): presumably the Content-Type to serve together with `result`;
// defaults to JSON. Confirm against the handler that returns task results.
std::string content_type = "application/json";
|
| 622 |
+
std::string error; // error message if failed
|
| 623 |
+
};
|
| 624 |
+
|
| 625 |
+
// Task records keyed by task id string. run_inference_task in main() takes
// tasks_mutex before touching this map.
static std::unordered_map<std::string, async_task_t> tasks;
|
| 626 |
+
// Mutex used to guard access to `tasks`.
static std::mutex tasks_mutex;
|
| 627 |
+
// Counter consumed by generate_task_id(); starts at 0 and is incremented
// atomically on every id request.
static std::atomic<uint64_t> task_counter{0};
|
| 628 |
+
|
| 629 |
+
// Builds a task id of the form "<n>-<ms>", where <n> is the pre-incremented
// value of task_counter (so the first call yields 1) and <ms> is the
// system_clock time since its epoch in milliseconds.
std::string generate_task_id() {
|
| 630 |
+
const uint64_t id = ++task_counter;
|
| 631 |
+
auto now = std::chrono::system_clock::now();
|
| 632 |
+
auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
|
| 633 |
+
std::stringstream ss;
|
| 634 |
+
ss << id << "-" << now_ms;
|
| 635 |
+
return ss.str();
|
| 636 |
+
}
|
| 637 |
+
}
|
| 638 |
+
|
| 639 |
int main(int argc, char ** argv) {
|
| 640 |
ggml_backend_load_all();
|
| 641 |
|
|
|
|
| 745 |
<style>
|
| 746 |
body {
|
| 747 |
font-family: sans-serif;
|
| 748 |
+
max-width: 900px;
|
| 749 |
+
margin: 1rem auto;
|
| 750 |
+
padding: 0 1rem;
|
| 751 |
}
|
| 752 |
form {
|
| 753 |
display: flex;
|
| 754 |
flex-direction: column;
|
| 755 |
align-items: flex-start;
|
| 756 |
+
margin-bottom: 1.5rem;
|
| 757 |
}
|
| 758 |
label {
|
| 759 |
margin-bottom: 0.5rem;
|
| 760 |
}
|
| 761 |
+
input, select, button, textarea {
|
| 762 |
margin-bottom: 1rem;
|
| 763 |
}
|
| 764 |
+
.box { border: 1px solid #ddd; padding: 1rem; border-radius: 6px; }
|
|
|
|
|
|
|
| 765 |
</style>
|
| 766 |
</head>
|
| 767 |
<body>
|
| 768 |
<h1>Whisper.cpp Server</h1>
|
| 769 |
|
| 770 |
+
<h2>/inference (同步示例)</h2>
|
| 771 |
<pre>
|
| 772 |
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
|
| 773 |
-H "Content-Type: multipart/form-data" \
|
|
|
|
| 784 |
-F model="<path-to-model-file>"
|
| 785 |
</pre>
|
| 786 |
|
| 787 |
+
<div class="box">
|
| 788 |
+
<h2>同步 Try it out</h2>
|
| 789 |
<form action="/inference" method="POST" enctype="multipart/form-data">
|
| 790 |
<label for="file">Choose an audio file:</label>
|
| 791 |
+
<input type="file" id="file" name="file" accept="audio/*" required>
|
|
|
|
|
|
|
|
|
|
| 792 |
|
| 793 |
<label for="response_format">Response Format:</label>
|
| 794 |
<select id="response_format" name="response_format">
|
|
|
|
| 797 |
<option value="text">Text</option>
|
| 798 |
<option value="srt">SRT</option>
|
| 799 |
<option value="vtt">VTT</option>
|
| 800 |
+
</select>
|
| 801 |
+
|
| 802 |
+
<button type="submit">Run synchronous inference</button>
|
| 803 |
+
</form>
|
| 804 |
+
</div>
|
| 805 |
+
|
| 806 |
+
<div class="box">
|
| 807 |
+
<h2>异步示例(可运行)</h2>
|
| 808 |
+
<form id="async-form" enctype="multipart/form-data">
|
| 809 |
+
<label for="afile">Choose an audio file:</label>
|
| 810 |
+
<input type="file" id="afile" name="file" accept="audio/*" required>
|
| 811 |
+
|
| 812 |
+
<label for="a_response_format">Response Format:</label>
|
| 813 |
+
<select id="a_response_format" name="response_format">
|
| 814 |
+
<option value="json">JSON</option>
|
| 815 |
+
<option value="text">Text</option>
|
| 816 |
+
<option value="srt">SRT</option>
|
| 817 |
+
<option value="vtt">VTT</option>
|
| 818 |
+
<option value="verbose_json">Verbose JSON</option>
|
| 819 |
+
</select>
|
| 820 |
|
| 821 |
+
<button id="async-submit" type="button">Submit async job</button>
|
| 822 |
</form>
|
| 823 |
+
|
| 824 |
+
<div>
|
| 825 |
+
<h3>Task</h3>
|
| 826 |
+
<div id="task-id">(no task)</div>
|
| 827 |
+
|
| 828 |
+
<h3>Result / Status</h3>
|
| 829 |
+
<pre id="task-result" style="white-space:pre-wrap; background:#f7f7f7; padding:0.5rem; border-radius:4px;"></pre>
|
| 830 |
+
</div>
|
| 831 |
</div>
|
| 832 |
+
|
| 833 |
+
<script>
|
| 834 |
+
(function(){
|
| 835 |
+
const submitBtn = document.getElementById('async-submit');
|
| 836 |
+
const form = document.getElementById('async-form');
|
| 837 |
+
const taskIdEl = document.getElementById('task-id');
|
| 838 |
+
const resultEl = document.getElementById('task-result');
|
| 839 |
+
let currentTask = null;
|
| 840 |
+
|
| 841 |
+
function sleep(ms){ return new Promise(r=>setTimeout(r, ms)); }
|
| 842 |
+
|
| 843 |
+
async function pollTask(id){
|
| 844 |
+
taskIdEl.textContent = id;
|
| 845 |
+
resultEl.textContent = 'processing...';
|
| 846 |
+
while (true) {
|
| 847 |
+
try {
|
| 848 |
+
const resp = await fetch('/inference_result?id=' + encodeURIComponent(id));
|
| 849 |
+
if (resp.status === 404) {
|
| 850 |
+
resultEl.textContent = 'task not found';
|
| 851 |
+
return;
|
| 852 |
+
}
|
| 853 |
+
const ctype = resp.headers.get('content-type') || '';
|
| 854 |
+
if (ctype.indexOf('application/json') !== -1) {
|
| 855 |
+
const j = await resp.json();
|
| 856 |
+
// if processing status, continue polling
|
| 857 |
+
if (j && j.status && (j.status === 'processing')) {
|
| 858 |
+
await sleep(1000);
|
| 859 |
+
continue;
|
| 860 |
+
}
|
| 861 |
+
resultEl.textContent = JSON.stringify(j, null, 2);
|
| 862 |
+
return;
|
| 863 |
+
} else {
|
| 864 |
+
// non-json (final text or srt/vtt)
|
| 865 |
+
const txt = await resp.text();
|
| 866 |
+
// if it's the processing JSON returned with application/json, handle above
|
| 867 |
+
if (txt && txt.indexOf('{"status":"processing"') !== -1) {
|
| 868 |
+
await sleep(1000);
|
| 869 |
+
continue;
|
| 870 |
+
}
|
| 871 |
+
resultEl.textContent = txt;
|
| 872 |
+
return;
|
| 873 |
+
}
|
| 874 |
+
} catch (err) {
|
| 875 |
+
resultEl.textContent = 'error: ' + err.message;
|
| 876 |
+
return;
|
| 877 |
+
}
|
| 878 |
+
}
|
| 879 |
+
}
|
| 880 |
+
|
| 881 |
+
submitBtn.addEventListener('click', async function(){
|
| 882 |
+
resultEl.textContent = '';
|
| 883 |
+
const fileInput = document.getElementById('afile');
|
| 884 |
+
if (!fileInput.files || fileInput.files.length === 0) {
|
| 885 |
+
alert('Please choose a file');
|
| 886 |
+
return;
|
| 887 |
+
}
|
| 888 |
+
const fd = new FormData();
|
| 889 |
+
fd.append('file', fileInput.files[0]);
|
| 890 |
+
fd.append('response_format', document.getElementById('a_response_format').value);
|
| 891 |
+
|
| 892 |
+
submitBtn.disabled = true;
|
| 893 |
+
submitBtn.textContent = 'Submitting...';
|
| 894 |
+
|
| 895 |
+
try {
|
| 896 |
+
const resp = await fetch('/inference_async', { method: 'POST', body: fd });
|
| 897 |
+
const j = await resp.json();
|
| 898 |
+
if (j && j.task_id) {
|
| 899 |
+
currentTask = j.task_id;
|
| 900 |
+
pollTask(currentTask);
|
| 901 |
+
} else {
|
| 902 |
+
resultEl.textContent = 'invalid response: ' + JSON.stringify(j);
|
| 903 |
+
}
|
| 904 |
+
} catch (err) {
|
| 905 |
+
resultEl.textContent = 'submit error: ' + err.message;
|
| 906 |
+
} finally {
|
| 907 |
+
submitBtn.disabled = false;
|
| 908 |
+
submitBtn.textContent = 'Submit async job';
|
| 909 |
+
}
|
| 910 |
+
});
|
| 911 |
+
})();
|
| 912 |
+
</script>
|
| 913 |
+
|
| 914 |
</body>
|
| 915 |
</html>
|
| 916 |
)";
|
|
|
|
| 927 |
svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
|
| 928 |
});
|
| 929 |
|
| 930 |
+
// Helper: run inference for a prepared audio buffer and params, store response in task
|
| 931 |
+
auto run_inference_task = [&](const std::string & task_id,
|
| 932 |
+
whisper_params task_params,
|
| 933 |
+
std::vector<float> pcmf32,
|
| 934 |
+
std::vector<std::vector<float>> pcmf32s,
|
| 935 |
+
const Request * orig_req) {
|
| 936 |
+
{
|
| 937 |
+
std::lock_guard<std::mutex> tlock(tasks_mutex);
|
| 938 |
+
tasks[task_id].status = async_status::RUNNING;
|
| 939 |
+
}
|
| 940 |
+
|
| 941 |
+
try {
|
| 942 |
+
// set up whisper params
|
| 943 |
+
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 944 |
+
|
| 945 |
+
wparams.strategy = task_params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
| 946 |
+
|
| 947 |
+
wparams.print_realtime = false;
|
| 948 |
+
wparams.print_progress = task_params.print_progress;
|
| 949 |
+
wparams.print_timestamps = !task_params.no_timestamps;
|
| 950 |
+
wparams.print_special = task_params.print_special;
|
| 951 |
+
wparams.translate = task_params.translate;
|
| 952 |
+
wparams.language = task_params.language.c_str();
|
| 953 |
+
wparams.detect_language = task_params.detect_language;
|
| 954 |
+
wparams.n_threads = task_params.n_threads;
|
| 955 |
+
wparams.n_max_text_ctx = task_params.max_context >= 0 ? task_params.max_context : wparams.n_max_text_ctx;
|
| 956 |
+
wparams.offset_ms = task_params.offset_t_ms;
|
| 957 |
+
wparams.duration_ms = task_params.duration_ms;
|
| 958 |
+
|
| 959 |
+
wparams.thold_pt = task_params.word_thold;
|
| 960 |
+
wparams.max_len = task_params.max_len == 0 ? 60 : task_params.max_len;
|
| 961 |
+
wparams.split_on_word = task_params.split_on_word;
|
| 962 |
+
wparams.audio_ctx = task_params.audio_ctx;
|
| 963 |
+
|
| 964 |
+
wparams.debug_mode = task_params.debug_mode;
|
| 965 |
+
|
| 966 |
+
wparams.tdrz_enable = task_params.tinydiarize; // [TDRZ]
|
| 967 |
+
|
| 968 |
+
wparams.initial_prompt = task_params.prompt.c_str();
|
| 969 |
+
|
| 970 |
+
wparams.greedy.best_of = task_params.best_of;
|
| 971 |
+
wparams.beam_search.beam_size = task_params.beam_size;
|
| 972 |
+
|
| 973 |
+
wparams.temperature = task_params.temperature;
|
| 974 |
+
wparams.no_speech_thold = task_params.no_speech_thold;
|
| 975 |
+
wparams.temperature_inc = task_params.temperature_inc;
|
| 976 |
+
wparams.entropy_thold = task_params.entropy_thold;
|
| 977 |
+
wparams.logprob_thold = task_params.logprob_thold;
|
| 978 |
+
|
| 979 |
+
wparams.no_timestamps = task_params.no_timestamps;
|
| 980 |
+
wparams.token_timestamps = !task_params.no_timestamps && task_params.response_format == vjson_format;
|
| 981 |
+
wparams.no_context = task_params.no_context;
|
| 982 |
+
|
| 983 |
+
wparams.suppress_nst = task_params.suppress_nst;
|
| 984 |
+
|
| 985 |
+
wparams.vad = task_params.vad;
|
| 986 |
+
wparams.vad_model_path = task_params.vad_model.c_str();
|
| 987 |
+
|
| 988 |
+
wparams.vad_params.threshold = task_params.vad_threshold;
|
| 989 |
+
wparams.vad_params.min_speech_duration_ms = task_params.vad_min_speech_duration_ms;
|
| 990 |
+
wparams.vad_params.min_silence_duration_ms = task_params.vad_min_silence_duration_ms;
|
| 991 |
+
wparams.vad_params.max_speech_duration_s = task_params.vad_max_speech_duration_s;
|
| 992 |
+
wparams.vad_params.speech_pad_ms = task_params.vad_speech_pad_ms;
|
| 993 |
+
wparams.vad_params.samples_overlap = task_params.vad_samples_overlap;
|
| 994 |
+
|
| 995 |
+
whisper_print_user_data user_data = { &task_params, &pcmf32s, 0 };
|
| 996 |
+
|
| 997 |
+
if (task_params.print_realtime) {
|
| 998 |
+
wparams.new_segment_callback = whisper_print_segment_callback;
|
| 999 |
+
wparams.new_segment_callback_user_data = &user_data;
|
| 1000 |
+
}
|
| 1001 |
+
|
| 1002 |
+
if (wparams.print_progress) {
|
| 1003 |
+
wparams.progress_callback = whisper_print_progress_callback;
|
| 1004 |
+
wparams.progress_callback_user_data = &user_data;
|
| 1005 |
+
}
|
| 1006 |
+
|
| 1007 |
+
// abort callback uses original request pointer if provided
|
| 1008 |
+
// ggml_abort_callback expects a function returning bool
|
| 1009 |
+
wparams.abort_callback = [](void *user_data)->bool {
|
| 1010 |
+
if (!user_data) return false;
|
| 1011 |
+
auto req_ptr = static_cast<const httplib::Request*>(user_data);
|
| 1012 |
+
return req_ptr->is_connection_closed();
|
| 1013 |
+
};
|
| 1014 |
+
wparams.abort_callback_user_data = (void*)orig_req;
|
| 1015 |
+
|
| 1016 |
+
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), task_params.n_processors) != 0) {
|
| 1017 |
+
// failure
|
| 1018 |
+
std::lock_guard<std::mutex> tlock(tasks_mutex);
|
| 1019 |
+
tasks[task_id].status = async_status::FAILED;
|
| 1020 |
+
tasks[task_id].error = "failed to process audio";
|
| 1021 |
+
tasks[task_id].result = "";
|
| 1022 |
+
return;
|
| 1023 |
+
}
|
| 1024 |
+
|
| 1025 |
+
// prepare response according to format
|
| 1026 |
+
std::string content;
|
| 1027 |
+
std::string ctype = "application/json";
|
| 1028 |
+
if (task_params.response_format == text_format) {
|
| 1029 |
+
content = output_str(ctx, task_params, pcmf32s);
|
| 1030 |
+
ctype = "text/plain; charset=utf-8";
|
| 1031 |
+
} else if (task_params.response_format == srt_format) {
|
| 1032 |
+
std::stringstream ss;
|
| 1033 |
+
const int n_segments = whisper_full_n_segments(ctx);
|
| 1034 |
+
for (int i = 0; i < n_segments; ++i) {
|
| 1035 |
+
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 1036 |
+
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 1037 |
+
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 1038 |
+
std::string speaker = "";
|
| 1039 |
+
if (task_params.diarize && pcmf32s.size() == 2) {
|
| 1040 |
+
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
|
| 1041 |
+
}
|
| 1042 |
+
ss << i + 1 + task_params.offset_n << "\n";
|
| 1043 |
+
ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
|
| 1044 |
+
ss << speaker << text << "\n\n";
|
| 1045 |
+
}
|
| 1046 |
+
content = ss.str();
|
| 1047 |
+
ctype = "application/x-subrip";
|
| 1048 |
+
} else if (task_params.response_format == vtt_format) {
|
| 1049 |
+
std::stringstream ss;
|
| 1050 |
+
ss << "WEBVTT\n\n";
|
| 1051 |
+
const int n_segments = whisper_full_n_segments(ctx);
|
| 1052 |
+
for (int i = 0; i < n_segments; ++i) {
|
| 1053 |
+
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 1054 |
+
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
| 1055 |
+
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
| 1056 |
+
std::string speaker = "";
|
| 1057 |
+
if (task_params.diarize && pcmf32s.size() == 2) {
|
| 1058 |
+
speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
|
| 1059 |
+
speaker.insert(0, "<v Speaker");
|
| 1060 |
+
speaker.append(">");
|
| 1061 |
+
}
|
| 1062 |
+
ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
|
| 1063 |
+
ss << speaker << text << "\n\n";
|
| 1064 |
+
}
|
| 1065 |
+
content = ss.str();
|
| 1066 |
+
ctype = "text/vtt";
|
| 1067 |
+
} else if (task_params.response_format == vjson_format) {
|
| 1068 |
+
std::string results = output_str(ctx, task_params, pcmf32s);
|
| 1069 |
+
json jres = json{
|
| 1070 |
+
{"task", task_params.translate ? "translate" : "transcribe"},
|
| 1071 |
+
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
|
| 1072 |
+
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
|
| 1073 |
+
{"text", results},
|
| 1074 |
+
{"segments", json::array()}
|
| 1075 |
+
};
|
| 1076 |
+
if (!task_params.no_language_probabilities) {
|
| 1077 |
+
std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
|
| 1078 |
+
const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, task_params.n_threads, lang_probs.data());
|
| 1079 |
+
jres["detected_language"] = whisper_lang_str_full(detected_lang_id);
|
| 1080 |
+
jres["detected_language_probability"] = lang_probs[detected_lang_id];
|
| 1081 |
+
jres["language_probabilities"] = json::object();
|
| 1082 |
+
for (int i = 0; i <= whisper_lang_max_id(); ++i) {
|
| 1083 |
+
if (lang_probs[i] > 0.001f) {
|
| 1084 |
+
jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
|
| 1085 |
+
}
|
| 1086 |
+
}
|
| 1087 |
+
}
|
| 1088 |
+
const int n_segments = whisper_full_n_segments(ctx);
|
| 1089 |
+
for (int i = 0; i < n_segments; ++i) {
|
| 1090 |
+
json segment = json{{"id", i}, {"text", whisper_full_get_segment_text(ctx, i)}};
|
| 1091 |
+
if (!task_params.no_timestamps) {
|
| 1092 |
+
segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
|
| 1093 |
+
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
|
| 1094 |
+
}
|
| 1095 |
+
float total_logprob = 0;
|
| 1096 |
+
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
| 1097 |
+
for (int j = 0; j < n_tokens; ++j) {
|
| 1098 |
+
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
| 1099 |
+
if (token.id >= whisper_token_eot(ctx)) continue;
|
| 1100 |
+
segment["tokens"].push_back(token.id);
|
| 1101 |
+
json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
|
| 1102 |
+
if (!task_params.no_timestamps) {
|
| 1103 |
+
word["start"] = token.t0 * 0.01;
|
| 1104 |
+
word["end"] = token.t1 * 0.01;
|
| 1105 |
+
word["t_dtw"] = token.t_dtw;
|
| 1106 |
+
}
|
| 1107 |
+
word["probability"] = token.p;
|
| 1108 |
+
total_logprob += token.plog;
|
| 1109 |
+
segment["words"].push_back(word);
|
| 1110 |
+
}
|
| 1111 |
+
segment["temperature"] = task_params.temperature;
|
| 1112 |
+
segment["avg_logprob"] = total_logprob / n_tokens;
|
| 1113 |
+
segment["no_speech_prob"] = whisper_full_get_segment_no_speech_prob(ctx, i);
|
| 1114 |
+
jres["segments"].push_back(segment);
|
| 1115 |
+
}
|
| 1116 |
+
content = jres.dump(-1, ' ', false, json::error_handler_t::replace);
|
| 1117 |
+
ctype = "application/json";
|
| 1118 |
+
} else {
|
| 1119 |
+
std::string results = output_str(ctx, task_params, pcmf32s);
|
| 1120 |
+
json jres = json{{"text", results}};
|
| 1121 |
+
content = jres.dump(-1, ' ', false, json::error_handler_t::replace);
|
| 1122 |
+
ctype = "application/json";
|
| 1123 |
+
}
|
| 1124 |
+
|
| 1125 |
+
// store result
|
| 1126 |
+
{
|
| 1127 |
+
std::lock_guard<std::mutex> tlock(tasks_mutex);
|
| 1128 |
+
tasks[task_id].status = async_status::FINISHED;
|
| 1129 |
+
tasks[task_id].result = content;
|
| 1130 |
+
tasks[task_id].content_type = ctype;
|
| 1131 |
+
}
|
| 1132 |
+
|
| 1133 |
+
} catch (const std::exception &e) {
|
| 1134 |
+
std::lock_guard<std::mutex> tlock(tasks_mutex);
|
| 1135 |
+
tasks[task_id].status = async_status::FAILED;
|
| 1136 |
+
tasks[task_id].error = e.what();
|
| 1137 |
+
tasks[task_id].result.clear();
|
| 1138 |
+
}
|
| 1139 |
+
};
|
| 1140 |
+
|
| 1141 |
+
// Synchronous inference kept for compatibility at original path
|
| 1142 |
svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
    // Synchronous inference endpoint.
    //
    // Decodes the uploaded "file" part into PCM, runs run_inference_task() on the
    // calling thread under a throw-away task id, then copies the stored outcome
    // into the HTTP response and drops the task entry again.
    //
    // whisper_mutex is held for the whole request, so this handler is serialized
    // against the background workers that take the same mutex.
    std::lock_guard<std::mutex> lock(whisper_mutex);

    if (!req.has_file("file")) {
        const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
        res.set_content(error_resp, "application/json");
        return;
    }
    auto audio_file = req.get_file_value("file");

    // Request fields are applied directly onto `params`, which this lambda
    // captures by reference (no per-request copy is made here).
    get_req_parameters(req, params);

    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    if (sparams.ffmpeg_converter) {
        // Spill the upload to disk so the converter can work on a file.
        const std::string temp_filename = generate_temp_filename("whisper-server", ".wav");
        std::ofstream temp_file{temp_filename, std::ios::binary};
        temp_file << audio_file.content;
        temp_file.close();

        std::string convert_error;
        if (!convert_to_wav(temp_filename, convert_error)) {
            // Best-effort cleanup so a failed conversion does not leave the upload
            // behind; removing an already-missing file is harmless.
            std::remove(temp_filename.c_str());
            res.set_content(convert_error, "application/json");
            return;
        }
        if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, params.diarize)) {
            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
            res.set_content(error_resp, "application/json");
            std::remove(temp_filename.c_str());
            return;
        }
        std::remove(temp_filename.c_str());
    } else {
        if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize)) {
            const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
            res.set_content(error_resp, "application/json");
            return;
        }
    }

    // Register a temporary task so run_inference_task() has a slot to write to.
    const std::string tmp_task_id = generate_task_id();
    {
        std::lock_guard<std::mutex> tlock(tasks_mutex);
        tasks[tmp_task_id] = async_task_t();
        tasks[tmp_task_id].status = async_status::PENDING;
    }

    // Run on this thread; the request pointer stays valid for the whole call.
    run_inference_task(tmp_task_id, params, std::move(pcmf32), std::move(pcmf32s), &req);

    // Hand the stored outcome back to the client and forget the temporary task.
    {
        std::lock_guard<std::mutex> tlock(tasks_mutex);
        auto & task = tasks[tmp_task_id];
        if (task.status == async_status::FINISHED) {
            res.set_content(task.result, task.content_type);
        } else if (task.status == async_status::FAILED) {
            if (task.error.empty()) {
                res.set_content("{\"error\":\"failed\"}", "application/json");
            } else {
                // The stored error is a plain message, not JSON: wrap it so the
                // body really matches the advertised content type.
                json jerr = { {"error", task.error} };
                res.set_content(jerr.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
            }
        } else {
            res.set_content("{\"status\":\"processing\"}", "application/json");
        }
        tasks.erase(tmp_task_id);
    }
});
|
| 1207 |
|
| 1208 |
+
// POST /inference_async -> enqueue background task and return task id
|
| 1209 |
+
svr->Post(sparams.request_path + "/inference_async", [&](const Request &req, Response &res){
    // Asynchronous inference endpoint.
    //
    // Decodes the uploaded "file" part into PCM on the request thread, registers
    // a PENDING task, starts a detached worker that runs run_inference_task()
    // under whisper_mutex, and immediately answers with {"task_id": ...}.
    if (!req.has_file("file")) {
        const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
        res.set_content(error_resp, "application/json");
        return;
    }

    // Work on a private copy so request fields never touch the shared defaults.
    // NOTE(review): `params` is read here without whisper_mutex while the
    // synchronous handler passes it to get_req_parameters() under that mutex -
    // confirm whether this copy needs synchronization.
    whisper_params task_params = params;
    get_req_parameters(req, task_params);

    auto audio_file = req.get_file_value("file");

    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;

    if (sparams.ffmpeg_converter) {
        // Spill the upload to disk so the converter can work on a file.
        const std::string temp_filename = generate_temp_filename("whisper-server", ".wav");
        std::ofstream temp_file{temp_filename, std::ios::binary};
        temp_file << audio_file.content;
        temp_file.close();

        std::string convert_error;
        if (!convert_to_wav(temp_filename, convert_error)) {
            // Best-effort cleanup so a failed conversion does not leave the upload
            // behind; removing an already-missing file is harmless.
            std::remove(temp_filename.c_str());
            res.set_content(convert_error, "application/json");
            return;
        }
        if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, task_params.diarize)) {
            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
            res.set_content(error_resp, "application/json");
            std::remove(temp_filename.c_str());
            return;
        }
        std::remove(temp_filename.c_str());
    } else {
        if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, task_params.diarize)) {
            const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
            res.set_content(error_resp, "application/json");
            return;
        }
    }

    // Publish a placeholder before the worker starts so a poll that arrives
    // right after this response already finds the task.
    const std::string task_id = generate_task_id();
    {
        std::lock_guard<std::mutex> tlock(tasks_mutex);
        tasks[task_id] = async_task_t();
        tasks[task_id].status = async_status::PENDING;
    }

    try {
        // The audio buffers are moved into the closure; everything request-scoped
        // that the worker needs is captured by value.
        std::thread worker([&, task_id, task_params, pcmf32 = std::move(pcmf32), pcmf32s = std::move(pcmf32s)]() mutable {
            // ensure only one inference runs at a time
            std::lock_guard<std::mutex> lock(whisper_mutex);
            // No Request pointer is handed over: the request object is gone once
            // this handler returns.
            run_inference_task(task_id, task_params, std::move(pcmf32), std::move(pcmf32s), nullptr);
        });
        worker.detach();
    } catch (const std::exception & e) {
        // The worker could not be started: drop the placeholder, otherwise it
        // would stay PENDING forever, and report the failure to the caller.
        {
            std::lock_guard<std::mutex> tlock(tasks_mutex);
            tasks.erase(task_id);
        }
        json jerr = { {"error", std::string("failed to start inference worker: ") + e.what()} };
        res.status = 500;
        res.set_content(jerr.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
        return;
    }

    json j = json{{"task_id", task_id}};
    res.set_content(j.dump(), "application/json");
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1270 |
|
| 1271 |
+
// GET /inference_result?id=<task_id> -> return status/result
|
| 1272 |
+
svr->Get(sparams.request_path + "/inference_result", [&](const Request &req, Response &res){
    // Polling endpoint for asynchronous tasks, addressed by the "id" query
    // parameter. Unfinished tasks answer {"status":"processing"}; a finished or
    // failed task is returned once and then removed from the task map.
    if (!req.has_param("id")) {
        res.set_content("{\"error\":\"missing id parameter\"}", "application/json");
        return;
    }
    const std::string task_key = req.get_param_value("id");

    // Filled while the map is locked, sent after the lock has been released.
    std::string body;
    std::string mime;
    {
        std::lock_guard<std::mutex> guard(tasks_mutex);

        const auto entry = tasks.find(task_key);
        if (entry == tasks.end()) {
            res.status = 404;
            res.set_content("{\"error\":\"task not found\"}", "application/json");
            return;
        }

        const auto & task = entry->second;
        const bool in_flight = task.status == async_status::PENDING
                            || task.status == async_status::RUNNING;
        if (in_flight) {
            json progress = { {"status", "processing"} };
            res.set_content(progress.dump(), "application/json");
            return;
        }

        if (task.status == async_status::FAILED) {
            json failure = { {"status", "failed"}, {"error", task.error} };
            body = failure.dump();
            mime = "application/json";
        } else {
            // Any remaining state is treated as FINISHED.
            body = task.result;
            mime = task.content_type;
        }

        // Terminal states are delivered exactly once; erase to avoid accumulation.
        tasks.erase(entry);
    }

    res.set_content(body, mime);
});
|
| 1312 |
svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
|
| 1313 |
std::lock_guard<std::mutex> lock(whisper_mutex);
|