3v324v23 committed
Commit 46ebeba · Parent: 6e115ac

add sync task

Files changed (2)
  1. Dockerfile +28 -0
  2. examples/server/server.cpp +470 -272
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM ubuntu:22.04 AS build
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y build-essential wget cmake git \
+     && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+ COPY .. .
+ RUN make base.en
+
+ FROM ubuntu:22.04 AS runtime
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
+     && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+ COPY --from=build /app /app
+ ENV PATH=/app/build/bin:$PATH
+ # ENTRYPOINT [ "bash", "-c" ]
+
+ RUN mkdir /models
+ RUN ./models/download-ggml-model.sh large-v3 /models
+ RUN ./models/download-vad-model.sh silero-v5.1.2 /models
+
+ EXPOSE 7860
+
+ CMD [ "whisper-server", "--host", "0.0.0.0", "--port", "7860", "--vad", "-vm", "/models/ggml-silero-v5.1.2.bin", "-m", "/models/ggml-large-v3.bin" ]
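
Not part of the commit, but as a usage sketch: the image above might be built and run roughly as follows. The tag name is an assumption; port 7860 comes from the EXPOSE and CMD lines.

  # build from the repository root (the Dockerfile copies the build context in)
  docker build -t whisper-server-async .

  # publish the container port that whisper-server listens on
  docker run --rm -p 7860:7860 whisper-server-async
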
examples/server/server.cpp CHANGED
@@ -19,6 +19,7 @@
  #include <atomic>
  #include <functional>
  #include <cstdlib>
+ #include <unordered_map>
  #if defined (_WIN32)
  #include <windows.h>
  #endif
@@ -610,6 +611,31 @@ void get_req_parameters(const Request & req, whisper_params & params)

  } // namespace

+ // Async task management
+ namespace {
+ enum class async_status { PENDING, RUNNING, FINISHED, FAILED };
+
+ struct async_task_t {
+     async_status status = async_status::PENDING;
+     std::string result;                            // final response body
+     std::string content_type = "application/json";
+     std::string error;                             // error message if failed
+ };
+
+ static std::unordered_map<std::string, async_task_t> tasks;
+ static std::mutex tasks_mutex;
+ static std::atomic<uint64_t> task_counter{0};
+
+ std::string generate_task_id() {
+     const uint64_t id = ++task_counter;
+     auto now = std::chrono::system_clock::now();
+     auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
+     std::stringstream ss;
+     ss << id << "-" << now_ms;
+     return ss.str();
+ }
+ }
+
  int main(int argc, char ** argv) {
      ggml_backend_load_all();

@@ -719,27 +745,29 @@ int main(int argc, char ** argv) {
      <style>
      body {
          font-family: sans-serif;
+         max-width: 900px;
+         margin: 1rem auto;
+         padding: 0 1rem;
      }
      form {
          display: flex;
          flex-direction: column;
          align-items: flex-start;
+         margin-bottom: 1.5rem;
      }
      label {
          margin-bottom: 0.5rem;
      }
-     input, select {
+     input, select, button, textarea {
          margin-bottom: 1rem;
      }
-     button {
-         margin-top: 1rem;
-     }
+     .box { border: 1px solid #ddd; padding: 1rem; border-radius: 6px; }
      </style>
  </head>
  <body>
      <h1>Whisper.cpp Server</h1>

-     <h2>/inference</h2>
+     <h2>/inference (synchronous example)</h2>
      <pre>
  curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
  -H "Content-Type: multipart/form-data" \
@@ -756,14 +784,11 @@ int main(int argc, char ** argv) {
  -F model="&lt;path-to-model-file&gt;"
      </pre>

-     <div>
-     <h2>Try it out</h2>
+     <div class="box">
+     <h2>Try it out (synchronous)</h2>
      <form action="/inference" method="POST" enctype="multipart/form-data">
      <label for="file">Choose an audio file:</label>
-     <input type="file" id="file" name="file" accept="audio/*" required><br>
-
-     <label for="temperature">Temperature:</label>
-     <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
+     <input type="file" id="file" name="file" accept="audio/*" required>

      <label for="response_format">Response Format:</label>
      <select id="response_format" name="response_format">
@@ -772,11 +797,120 @@ int main(int argc, char ** argv) {
          <option value="text">Text</option>
          <option value="srt">SRT</option>
          <option value="vtt">VTT</option>
-     </select><br>
+     </select>
+
+     <button type="submit">Run synchronous inference</button>
+     </form>
+     </div>
+
+     <div class="box">
+     <h2>Async example (runnable)</h2>
+     <form id="async-form" enctype="multipart/form-data">
+         <label for="afile">Choose an audio file:</label>
+         <input type="file" id="afile" name="file" accept="audio/*" required>
+
+         <label for="a_response_format">Response Format:</label>
+         <select id="a_response_format" name="response_format">
+             <option value="json">JSON</option>
+             <option value="text">Text</option>
+             <option value="srt">SRT</option>
+             <option value="vtt">VTT</option>
+             <option value="verbose_json">Verbose JSON</option>
+         </select>

-     <button type="submit">Submit</button>
+         <button id="async-submit" type="button">Submit async job</button>
      </form>
+
+     <div>
+         <h3>Task</h3>
+         <div id="task-id">(no task)</div>
+
+         <h3>Result / Status</h3>
+         <pre id="task-result" style="white-space:pre-wrap; background:#f7f7f7; padding:0.5rem; border-radius:4px;"></pre>
+     </div>
      </div>
+
+     <script>
+     (function(){
+         const submitBtn = document.getElementById('async-submit');
+         const form = document.getElementById('async-form');
+         const taskIdEl = document.getElementById('task-id');
+         const resultEl = document.getElementById('task-result');
+         let currentTask = null;
+
+         function sleep(ms){ return new Promise(r=>setTimeout(r, ms)); }
+
+         async function pollTask(id){
+             taskIdEl.textContent = id;
+             resultEl.textContent = 'processing...';
+             while (true) {
+                 try {
+                     const resp = await fetch('/inference_result?id=' + encodeURIComponent(id));
+                     if (resp.status === 404) {
+                         resultEl.textContent = 'task not found';
+                         return;
+                     }
+                     const ctype = resp.headers.get('content-type') || '';
+                     if (ctype.indexOf('application/json') !== -1) {
+                         const j = await resp.json();
+                         // if still processing, continue polling
+                         if (j && j.status && (j.status === 'processing')) {
+                             await sleep(1000);
+                             continue;
+                         }
+                         resultEl.textContent = JSON.stringify(j, null, 2);
+                         return;
+                     } else {
+                         // non-JSON (final text or srt/vtt)
+                         const txt = await resp.text();
+                         if (txt && txt.indexOf('{"status":"processing"') !== -1) {
+                             await sleep(1000);
+                             continue;
+                         }
+                         resultEl.textContent = txt;
+                         return;
+                     }
+                 } catch (err) {
+                     resultEl.textContent = 'error: ' + err.message;
+                     return;
+                 }
+             }
+         }
+
+         submitBtn.addEventListener('click', async function(){
+             resultEl.textContent = '';
+             const fileInput = document.getElementById('afile');
+             if (!fileInput.files || fileInput.files.length === 0) {
+                 alert('Please choose a file');
+                 return;
+             }
+             const fd = new FormData();
+             fd.append('file', fileInput.files[0]);
+             fd.append('response_format', document.getElementById('a_response_format').value);
+
+             submitBtn.disabled = true;
+             submitBtn.textContent = 'Submitting...';
+
+             try {
+                 const resp = await fetch('/inference_async', { method: 'POST', body: fd });
+                 const j = await resp.json();
+                 if (j && j.task_id) {
+                     currentTask = j.task_id;
+                     pollTask(currentTask);
+                 } else {
+                     resultEl.textContent = 'invalid response: ' + JSON.stringify(j);
+                 }
+             } catch (err) {
+                 resultEl.textContent = 'submit error: ' + err.message;
+             } finally {
+                 submitBtn.disabled = false;
+                 submitBtn.textContent = 'Submit async job';
+             }
+         });
+     })();
+     </script>
+
  </body>
  </html>
  )";
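
The form and script above drive the new endpoints from a browser. A command-line sketch of the same submit-then-poll flow, assuming a server on 127.0.0.1:7860, an input file sample.wav, and jq available for JSON field extraction:

  # submit a job; the server replies with {"task_id":"<counter>-<epoch_ms>"}
  TASK_ID=$(curl -s http://127.0.0.1:7860/inference_async \
      -F file=@sample.wav \
      -F response_format=json | jq -r .task_id)

  # poll; a finished task is erased on first read, so keep the fetched body
  while :; do
      BODY=$(curl -s "http://127.0.0.1:7860/inference_result?id=$TASK_ID")
      [ "$BODY" = '{"status":"processing"}' ] || { printf '%s\n' "$BODY"; break; }
      sleep 1
  done
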
@@ -793,323 +927,387 @@ int main(int argc, char ** argv) {
  svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
  });

+ // Helper: run inference for a prepared audio buffer and params, store response in task
+ auto run_inference_task = [&](const std::string & task_id,
+                               whisper_params task_params,
+                               std::vector<float> pcmf32,
+                               std::vector<std::vector<float>> pcmf32s,
+                               const Request * orig_req) {
+     {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[task_id].status = async_status::RUNNING;
+     }
+
+     try {
+         // set up whisper params
+         whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+         wparams.strategy = task_params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
+
+         wparams.print_realtime   = false;
+         wparams.print_progress   = task_params.print_progress;
+         wparams.print_timestamps = !task_params.no_timestamps;
+         wparams.print_special    = task_params.print_special;
+         wparams.translate        = task_params.translate;
+         wparams.language         = task_params.language.c_str();
+         wparams.detect_language  = task_params.detect_language;
+         wparams.n_threads        = task_params.n_threads;
+         wparams.n_max_text_ctx   = task_params.max_context >= 0 ? task_params.max_context : wparams.n_max_text_ctx;
+         wparams.offset_ms        = task_params.offset_t_ms;
+         wparams.duration_ms      = task_params.duration_ms;
+
+         wparams.thold_pt         = task_params.word_thold;
+         wparams.max_len          = task_params.max_len == 0 ? 60 : task_params.max_len;
+         wparams.split_on_word    = task_params.split_on_word;
+         wparams.audio_ctx        = task_params.audio_ctx;
+
+         wparams.debug_mode       = task_params.debug_mode;
+
+         wparams.tdrz_enable      = task_params.tinydiarize; // [TDRZ]
+
+         wparams.initial_prompt   = task_params.prompt.c_str();
+
+         wparams.greedy.best_of        = task_params.best_of;
+         wparams.beam_search.beam_size = task_params.beam_size;
+
+         wparams.temperature      = task_params.temperature;
+         wparams.no_speech_thold  = task_params.no_speech_thold;
+         wparams.temperature_inc  = task_params.temperature_inc;
+         wparams.entropy_thold    = task_params.entropy_thold;
+         wparams.logprob_thold    = task_params.logprob_thold;
+
+         wparams.no_timestamps    = task_params.no_timestamps;
+         wparams.token_timestamps = !task_params.no_timestamps && task_params.response_format == vjson_format;
+         wparams.no_context       = task_params.no_context;
+
+         wparams.suppress_nst     = task_params.suppress_nst;
+
+         wparams.vad              = task_params.vad;
+         wparams.vad_model_path   = task_params.vad_model.c_str();
+
+         wparams.vad_params.threshold               = task_params.vad_threshold;
+         wparams.vad_params.min_speech_duration_ms  = task_params.vad_min_speech_duration_ms;
+         wparams.vad_params.min_silence_duration_ms = task_params.vad_min_silence_duration_ms;
+         wparams.vad_params.max_speech_duration_s   = task_params.vad_max_speech_duration_s;
+         wparams.vad_params.speech_pad_ms           = task_params.vad_speech_pad_ms;
+         wparams.vad_params.samples_overlap         = task_params.vad_samples_overlap;
+
+         whisper_print_user_data user_data = { &task_params, &pcmf32s, 0 };
+
+         if (task_params.print_realtime) {
+             wparams.new_segment_callback           = whisper_print_segment_callback;
+             wparams.new_segment_callback_user_data = &user_data;
+         }
+
+         if (wparams.print_progress) {
+             wparams.progress_callback           = whisper_print_progress_callback;
+             wparams.progress_callback_user_data = &user_data;
+         }
+
+         // abort callback uses original request pointer if provided
+         // ggml_abort_callback expects a function returning bool
+         wparams.abort_callback = [](void * user_data) -> bool {
+             if (!user_data) return false;
+             auto req_ptr = static_cast<const httplib::Request *>(user_data);
+             return req_ptr->is_connection_closed();
+         };
+         wparams.abort_callback_user_data = (void *) orig_req;
+
+         if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), task_params.n_processors) != 0) {
+             // failure
+             std::lock_guard<std::mutex> tlock(tasks_mutex);
+             tasks[task_id].status = async_status::FAILED;
+             tasks[task_id].error  = "failed to process audio";
+             tasks[task_id].result = "";
+             return;
+         }
+
+         // prepare response according to format
+         std::string content;
+         std::string ctype = "application/json";
+         if (task_params.response_format == text_format) {
+             content = output_str(ctx, task_params, pcmf32s);
+             ctype = "text/plain; charset=utf-8";
+         } else if (task_params.response_format == srt_format) {
+             std::stringstream ss;
+             const int n_segments = whisper_full_n_segments(ctx);
+             for (int i = 0; i < n_segments; ++i) {
+                 const char * text = whisper_full_get_segment_text(ctx, i);
+                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+                 std::string speaker = "";
+                 if (task_params.diarize && pcmf32s.size() == 2) {
+                     speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
+                 }
+                 ss << i + 1 + task_params.offset_n << "\n";
+                 ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
+                 ss << speaker << text << "\n\n";
+             }
+             content = ss.str();
+             ctype = "application/x-subrip";
+         } else if (task_params.response_format == vtt_format) {
+             std::stringstream ss;
+             ss << "WEBVTT\n\n";
+             const int n_segments = whisper_full_n_segments(ctx);
+             for (int i = 0; i < n_segments; ++i) {
+                 const char * text = whisper_full_get_segment_text(ctx, i);
+                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+                 std::string speaker = "";
+                 if (task_params.diarize && pcmf32s.size() == 2) {
+                     speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
+                     speaker.insert(0, "<v Speaker");
+                     speaker.append(">");
+                 }
+                 ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
+                 ss << speaker << text << "\n\n";
+             }
+             content = ss.str();
+             ctype = "text/vtt";
+         } else if (task_params.response_format == vjson_format) {
+             std::string results = output_str(ctx, task_params, pcmf32s);
+             json jres = json{
+                 {"task", task_params.translate ? "translate" : "transcribe"},
+                 {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
+                 {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
+                 {"text", results},
+                 {"segments", json::array()}
+             };
+             if (!task_params.no_language_probabilities) {
+                 std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
+                 const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, task_params.n_threads, lang_probs.data());
+                 jres["detected_language"]             = whisper_lang_str_full(detected_lang_id);
+                 jres["detected_language_probability"] = lang_probs[detected_lang_id];
+                 jres["language_probabilities"]        = json::object();
+                 for (int i = 0; i <= whisper_lang_max_id(); ++i) {
+                     if (lang_probs[i] > 0.001f) {
+                         jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
+                     }
+                 }
+             }
+             const int n_segments = whisper_full_n_segments(ctx);
+             for (int i = 0; i < n_segments; ++i) {
+                 json segment = json{{"id", i}, {"text", whisper_full_get_segment_text(ctx, i)}};
+                 if (!task_params.no_timestamps) {
+                     segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
+                     segment["end"]   = whisper_full_get_segment_t1(ctx, i) * 0.01;
+                 }
+                 float total_logprob = 0;
+                 const int n_tokens = whisper_full_n_tokens(ctx, i);
+                 for (int j = 0; j < n_tokens; ++j) {
+                     whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
+                     if (token.id >= whisper_token_eot(ctx)) continue;
+                     segment["tokens"].push_back(token.id);
+                     json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
+                     if (!task_params.no_timestamps) {
+                         word["start"] = token.t0 * 0.01;
+                         word["end"]   = token.t1 * 0.01;
+                         word["t_dtw"] = token.t_dtw;
+                     }
+                     word["probability"] = token.p;
+                     total_logprob += token.plog;
+                     segment["words"].push_back(word);
+                 }
+                 segment["temperature"]    = task_params.temperature;
+                 segment["avg_logprob"]    = total_logprob / n_tokens;
+                 segment["no_speech_prob"] = whisper_full_get_segment_no_speech_prob(ctx, i);
+                 jres["segments"].push_back(segment);
+             }
+             content = jres.dump(-1, ' ', false, json::error_handler_t::replace);
+             ctype = "application/json";
+         } else {
+             std::string results = output_str(ctx, task_params, pcmf32s);
+             json jres = json{{"text", results}};
+             content = jres.dump(-1, ' ', false, json::error_handler_t::replace);
+             ctype = "application/json";
+         }
+
+         // store result
+         {
+             std::lock_guard<std::mutex> tlock(tasks_mutex);
+             tasks[task_id].status       = async_status::FINISHED;
+             tasks[task_id].result       = content;
+             tasks[task_id].content_type = ctype;
+         }
+
+     } catch (const std::exception & e) {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[task_id].status = async_status::FAILED;
+         tasks[task_id].error  = e.what();
+         tasks[task_id].result.clear();
+     }
+ };
+
+ // Synchronous inference kept for compatibility at the original path
  svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
-     // acquire whisper model mutex lock
+     // existing synchronous behavior: simply call the async helper synchronously while holding the mutex
      std::lock_guard<std::mutex> lock(whisper_mutex);

-     // first check user requested fields of the request
-     if (!req.has_file("file"))
-     {
-         fprintf(stderr, "error: no 'file' field in the request\n");
+     if (!req.has_file("file")) {
          const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
          res.set_content(error_resp, "application/json");
          return;
      }
      auto audio_file = req.get_file_value("file");
-
-     // check non-required fields
+     // gather parameters
      get_req_parameters(req, params);

-     std::string filename{audio_file.filename};
-     printf("Received request: %s\n", filename.c_str());
-
-     // audio arrays
-     std::vector<float> pcmf32;               // mono-channel F32 PCM
-     std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
+     std::vector<float> pcmf32;
+     std::vector<std::vector<float>> pcmf32s;
      if (sparams.ffmpeg_converter) {
-         // if file is not wav, convert to wav
-         // write to temporary file
          const std::string temp_filename = generate_temp_filename("whisper-server", ".wav");
          std::ofstream temp_file{temp_filename, std::ios::binary};
          temp_file << audio_file.content;
          temp_file.close();
-
-         std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
-         const bool is_converted = convert_to_wav(temp_filename, error_resp);
-         if (!is_converted) {
+         std::string error_resp;
+         if (!convert_to_wav(temp_filename, error_resp)) {
             res.set_content(error_resp, "application/json");
             return;
         }
-
-         // read audio content into pcmf32
-         if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, params.diarize))
-         {
-             fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
+         if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, params.diarize)) {
             const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
             res.set_content(error_resp, "application/json");
             std::remove(temp_filename.c_str());
             return;
         }
-         // remove temp file
         std::remove(temp_filename.c_str());
     } else {
-         if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
-         {
-             fprintf(stderr, "error: failed to read audio data\n");
+         if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize)) {
             const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
             res.set_content(error_resp, "application/json");
             return;
         }
     }

-     printf("Successfully loaded %s\n", filename.c_str());
-
-     // print system information
+     // create a temporary task id to run synchronously
+     const std::string tmp_task_id = generate_task_id();
     {
-         fprintf(stderr, "\n");
-         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                 params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[tmp_task_id] = async_task_t();
+         tasks[tmp_task_id].status = async_status::PENDING;
     }

-     // print some info about the processing
+     // run in same thread
+     run_inference_task(tmp_task_id, params, std::move(pcmf32), std::move(pcmf32s), &req);
+
+     // return the stored result
     {
-         fprintf(stderr, "\n");
-         if (!whisper_is_multilingual(ctx)) {
-             if (params.language != "en" || params.translate) {
-                 params.language = "en";
-                 params.translate = false;
-                 fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-             }
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         if (tasks[tmp_task_id].status == async_status::FINISHED) {
+             res.set_content(tasks[tmp_task_id].result, tasks[tmp_task_id].content_type);
+         } else if (tasks[tmp_task_id].status == async_status::FAILED) {
+             const std::string err = tasks[tmp_task_id].error.empty() ? "{\"error\":\"failed\"}" : tasks[tmp_task_id].error;
+             res.set_content(err, "application/json");
+         } else {
+             res.set_content("{\"status\":\"processing\"}", "application/json");
         }
-         if (params.detect_language) {
-             params.language = "auto";
-         }
-         fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
-                 __func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                 params.n_threads, params.n_processors,
-                 params.language.c_str(),
-                 params.translate ? "translate" : "transcribe",
-                 params.tinydiarize ? "tdrz = 1, " : "",
-                 params.no_timestamps ? 0 : 1);
-
-         fprintf(stderr, "\n");
+         tasks.erase(tmp_task_id);
     }
+ });

-     // run the inference
-     {
-         printf("Running whisper.cpp inference on %s\n", filename.c_str());
-         whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-         wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
-         wparams.print_realtime   = false;
-         wparams.print_progress   = params.print_progress;
-         wparams.print_timestamps = !params.no_timestamps;
-         wparams.print_special    = params.print_special;
-         wparams.translate        = params.translate;
-         wparams.language         = params.language.c_str();
-         wparams.detect_language  = params.detect_language;
-         wparams.n_threads        = params.n_threads;
-         wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-         wparams.offset_ms        = params.offset_t_ms;
-         wparams.duration_ms      = params.duration_ms;
-
-         wparams.thold_pt         = params.word_thold;
-         wparams.max_len          = params.max_len == 0 ? 60 : params.max_len;
-         wparams.split_on_word    = params.split_on_word;
-         wparams.audio_ctx        = params.audio_ctx;
-
-         wparams.debug_mode       = params.debug_mode;
-
-         wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
-
-         wparams.initial_prompt   = params.prompt.c_str();
-
-         wparams.greedy.best_of        = params.best_of;
-         wparams.beam_search.beam_size = params.beam_size;
-
-         wparams.temperature      = params.temperature;
-         wparams.no_speech_thold  = params.no_speech_thold;
-         wparams.temperature_inc  = params.temperature_inc;
-         wparams.entropy_thold    = params.entropy_thold;
-         wparams.logprob_thold    = params.logprob_thold;
-
-         wparams.no_timestamps    = params.no_timestamps;
-         wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
-         wparams.no_context       = params.no_context;
-
-         wparams.suppress_nst     = params.suppress_nst;
-
-         wparams.vad              = params.vad;
-         wparams.vad_model_path   = params.vad_model.c_str();
-
-         wparams.vad_params.threshold               = params.vad_threshold;
-         wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
-         wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
-         wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
-         wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
-         wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
-
-         whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
-
-         // this callback is called on each new segment
-         if (params.print_realtime) {
-             wparams.new_segment_callback           = whisper_print_segment_callback;
-             wparams.new_segment_callback_user_data = &user_data;
-         }
-
-         if (wparams.print_progress) {
-             wparams.progress_callback           = whisper_print_progress_callback;
-             wparams.progress_callback_user_data = &user_data;
-         }
-
-         // tell whisper to abort if the HTTP connection closed
-         wparams.abort_callback = [](void * user_data) {
-             // user_data is a pointer to our Request
-             auto req_ptr = static_cast<const httplib::Request *>(user_data);
-             return req_ptr->is_connection_closed();
-         };
-         wparams.abort_callback_user_data = (void *) &req;
-
-         if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-             // handle failure or early abort
-             if (req.is_connection_closed()) {
-                 // log client disconnect
-                 fprintf(stderr, "client disconnected, aborted processing\n");
-                 res.status = 499; // Client Closed Request (nginx convention)
-                 res.set_content("{\"error\":\"client disconnected\"}", "application/json");
-                 return;
-             }
-             fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-             res.status = 500; // Internal Server Error
-             const std::string error_resp = "{\"error\":\"failed to process audio\"}";
-             res.set_content(error_resp, "application/json");
-             return;
-         }
-     }
-
-     // return results to user
-     if (params.response_format == text_format)
-     {
-         std::string results = output_str(ctx, params, pcmf32s);
-         res.set_content(results.c_str(), "text/html; charset=utf-8");
-     }
-     else if (params.response_format == srt_format)
-     {
-         std::stringstream ss;
-         const int n_segments = whisper_full_n_segments(ctx);
-         for (int i = 0; i < n_segments; ++i) {
-             const char * text = whisper_full_get_segment_text(ctx, i);
-             const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-             const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-             std::string speaker = "";
-
-             if (params.diarize && pcmf32s.size() == 2)
-             {
-                 speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-             }
-
-             ss << i + 1 + params.offset_n << "\n";
-             ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
-             ss << speaker << text << "\n\n";
-         }
-         res.set_content(ss.str(), "application/x-subrip");
-     } else if (params.response_format == vtt_format) {
-         std::stringstream ss;
-
-         ss << "WEBVTT\n\n";
-
-         const int n_segments = whisper_full_n_segments(ctx);
-         for (int i = 0; i < n_segments; ++i) {
-             const char * text = whisper_full_get_segment_text(ctx, i);
-             const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-             const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-             std::string speaker = "";
-
-             if (params.diarize && pcmf32s.size() == 2)
-             {
-                 speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
-                 speaker.insert(0, "<v Speaker");
-                 speaker.append(">");
-             }
-
-             ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
-             ss << speaker << text << "\n\n";
-         }
-         res.set_content(ss.str(), "text/vtt");
-     } else if (params.response_format == vjson_format) {
-         /* try to match openai/whisper's Python format */
-         std::string results = output_str(ctx, params, pcmf32s);
-         json jres = json{
-             {"task", params.translate ? "translate" : "transcribe"},
-             {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
-             {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
-             {"text", results},
-             {"segments", json::array()}
-         };
-         // Only compute language probabilities if requested (expensive operation)
-         if (!params.no_language_probabilities) {
-             std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
-             const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
-             jres["detected_language"]             = whisper_lang_str_full(detected_lang_id);
-             jres["detected_language_probability"] = lang_probs[detected_lang_id];
-             jres["language_probabilities"]        = json::object();
-             // Add all language probabilities
-             for (int i = 0; i <= whisper_lang_max_id(); ++i) {
-                 if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
-                     jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
-                 }
-             }
-         }
-         const int n_segments = whisper_full_n_segments(ctx);
-         for (int i = 0; i < n_segments; ++i)
-         {
-             json segment = json{
-                 {"id", i},
-                 {"text", whisper_full_get_segment_text(ctx, i)},
-             };
-
-             if (!params.no_timestamps) {
-                 segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
-                 segment["end"]   = whisper_full_get_segment_t1(ctx, i) * 0.01;
-             }
-
-             float total_logprob = 0;
-             const int n_tokens = whisper_full_n_tokens(ctx, i);
-             for (int j = 0; j < n_tokens; ++j) {
-                 whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
-                 if (token.id >= whisper_token_eot(ctx)) {
-                     continue;
-                 }
-
-                 segment["tokens"].push_back(token.id);
-                 json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
-                 if (!params.no_timestamps) {
-                     word["start"] = token.t0 * 0.01;
-                     word["end"]   = token.t1 * 0.01;
-                     word["t_dtw"] = token.t_dtw;
-                 }
-                 word["probability"] = token.p;
-                 total_logprob += token.plog;
-                 segment["words"].push_back(word);
-             }
-
-             segment["temperature"] = params.temperature;
-             segment["avg_logprob"] = total_logprob / n_tokens;
-
-             // TODO compression_ratio and no_speech_prob are not implemented yet
-             // segment["compression_ratio"] = 0;
-             segment["no_speech_prob"] = whisper_full_get_segment_no_speech_prob(ctx, i);
-
-             jres["segments"].push_back(segment);
-         }
-         res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
-                         "application/json");
-     }
-     // TODO add more output formats
-     else
-     {
-         std::string results = output_str(ctx, params, pcmf32s);
-         json jres = json{
-             {"text", results}
-         };
-         res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
-                         "application/json");
-     }
-
-     // reset params to their defaults
-     params = default_params;
+ // POST /inference_async -> enqueue background task and return task id
+ svr->Post(sparams.request_path + "/inference_async", [&](const Request &req, Response &res){
+     if (!req.has_file("file")) {
+         const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
+         res.set_content(error_resp, "application/json");
+         return;
+     }
+
+     // prepare params and audio buffers without holding model mutex for long
+     whisper_params task_params = params; // copy default base
+     get_req_parameters(req, task_params);
+
+     auto audio_file = req.get_file_value("file");
+
+     std::vector<float> pcmf32;
+     std::vector<std::vector<float>> pcmf32s;
+
+     if (sparams.ffmpeg_converter) {
+         const std::string temp_filename = generate_temp_filename("whisper-server", ".wav");
+         std::ofstream temp_file{temp_filename, std::ios::binary};
+         temp_file << audio_file.content;
+         temp_file.close();
+         std::string error_resp;
+         if (!convert_to_wav(temp_filename, error_resp)) {
+             res.set_content(error_resp, "application/json");
+             return;
+         }
+         if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, task_params.diarize)) {
+             const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+             res.set_content(error_resp, "application/json");
+             std::remove(temp_filename.c_str());
+             return;
+         }
+         std::remove(temp_filename.c_str());
+     } else {
+         if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, task_params.diarize)) {
+             const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
+             res.set_content(error_resp, "application/json");
+             return;
+         }
+     }
+
+     // create task id and store placeholder
+     const std::string task_id = generate_task_id();
+     {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[task_id] = async_task_t();
+         tasks[task_id].status = async_status::PENDING;
+     }
+
+     // spawn background worker thread
+     std::thread worker([&, task_id, task_params, pcmf32 = std::move(pcmf32), pcmf32s = std::move(pcmf32s)]() mutable {
+         // ensure only one inference runs at a time
+         std::lock_guard<std::mutex> lock(whisper_mutex);
+         // do not pass a pointer to the Request object into the background thread - it will be out of scope
+         run_inference_task(task_id, task_params, std::move(pcmf32), std::move(pcmf32s), nullptr);
+     });
+     worker.detach();
+
+     json j = json{{"task_id", task_id}};
+     res.set_content(j.dump(), "application/json");
+ });
+
+ // GET /inference_result?id=<task_id> -> return status/result
+ svr->Get(sparams.request_path + "/inference_result", [&](const Request &req, Response &res){
+     if (!req.has_param("id")) {
+         res.set_content("{\"error\":\"missing id parameter\"}", "application/json");
+         return;
+     }
+     const std::string id = req.get_param_value("id");
+     // copy needed data while holding lock, then release lock and send response
+     std::string out_body;
+     std::string out_ctype;
+     {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         auto it = tasks.find(id);
+         if (it == tasks.end()) {
+             res.set_content("{\"error\":\"task not found\"}", "application/json");
+             res.status = 404;
+             return;
+         }
+
+         const auto & t = it->second;
+         if (t.status == async_status::PENDING || t.status == async_status::RUNNING) {
+             json j = { {"status", "processing"} };
+             res.set_content(j.dump(), "application/json");
+             return;
+         }
+         if (t.status == async_status::FAILED) {
+             json j = { {"status", "failed"}, {"error", t.error} };
+             out_body  = j.dump();
+             out_ctype = "application/json";
+             // remove failed task from map to avoid accumulation
+             tasks.erase(it);
+         } else {
+             // FINISHED: copy and erase the task so it's cleaned up after retrieval
+             out_body  = t.result;
+             out_ctype = t.content_type;
+             tasks.erase(it);
+         }
+     }
+
+     res.set_content(out_body, out_ctype);
  });
  svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
      std::lock_guard<std::mutex> lock(whisper_mutex);
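
A note on the retrieval semantics implemented above: /inference_result is one-shot. Pending and running tasks report processing; failed and finished tasks are erased from the map as they are read, so a second request for the same id returns 404. A sketch of the edge cases, with an assumed host/port:

  curl -s 'http://127.0.0.1:7860/inference_result'
  # -> {"error":"missing id parameter"}

  curl -si 'http://127.0.0.1:7860/inference_result?id=unknown-id' | head -n 1
  # -> HTTP/1.1 404 Not Found    (body: {"error":"task not found"})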