3v324v23 committed
Commit 46ebeba · Parent: 6e115ac

add sync task

Files changed (2)
  1. Dockerfile +28 -0
  2. examples/server/server.cpp +470 -272
Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM ubuntu:22.04 AS build
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y build-essential wget cmake git \
+     && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+ COPY .. .
+ RUN make base.en
+
+ FROM ubuntu:22.04 AS runtime
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
+     && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+ COPY --from=build /app /app
+ ENV PATH=/app/build/bin:$PATH
+ # ENTRYPOINT [ "bash", "-c" ]
+
+ RUN mkdir /models
+ RUN ./models/download-ggml-model.sh large-v3 /models
+ RUN ./models/download-vad-model.sh silero-v5.1.2 /models
+
+ EXPOSE 7860
+
+ CMD [ "whisper-server", "--host", "0.0.0.0", "--port", "7860", "--vad", "-vm", "/models/ggml-silero-v5.1.2.bin", "-m", "/models/ggml-large-v3.bin" ]
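
Not part of the commit, but as a usage sketch: the image above might be built and run roughly as follows. The tag name is an assumption; port 7860 comes from the EXPOSE and CMD lines.

  # build from the repository root (the Dockerfile copies the build context in)
  docker build -t whisper-server-async .

  # publish the container port that whisper-server listens on
  docker run --rm -p 7860:7860 whisper-server-async
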
examples/server/server.cpp CHANGED
@@ -19,6 +19,7 @@
  #include <atomic>
  #include <functional>
  #include <cstdlib>
+ #include <unordered_map>
  #if defined (_WIN32)
  #include <windows.h>
  #endif
@@ -610,6 +611,31 @@ void get_req_parameters(const Request & req, whisper_params & params)

  } // namespace

+ // Async task management
+ namespace {
+ enum class async_status { PENDING, RUNNING, FINISHED, FAILED };
+
+ struct async_task_t {
+     async_status status = async_status::PENDING;
+     std::string result;                            // final response body
+     std::string content_type = "application/json";
+     std::string error;                             // error message if failed
+ };
+
+ static std::unordered_map<std::string, async_task_t> tasks;
+ static std::mutex tasks_mutex;
+ static std::atomic<uint64_t> task_counter{0};
+
+ std::string generate_task_id() {
+     const uint64_t id = ++task_counter;
+     auto now = std::chrono::system_clock::now();
+     auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
+     std::stringstream ss;
+     ss << id << "-" << now_ms;
+     return ss.str();
+ }
+ }
+
  int main(int argc, char ** argv) {
      ggml_backend_load_all();

@@ -719,27 +745,29 @@ int main(int argc, char ** argv) {
      <style>
      body {
          font-family: sans-serif;
+         max-width: 900px;
+         margin: 1rem auto;
+         padding: 0 1rem;
      }
      form {
          display: flex;
          flex-direction: column;
          align-items: flex-start;
+         margin-bottom: 1.5rem;
      }
      label {
          margin-bottom: 0.5rem;
      }
-     input, select {
+     input, select, button, textarea {
          margin-bottom: 1rem;
      }
-     button {
-         margin-top: 1rem;
-     }
+     .box { border: 1px solid #ddd; padding: 1rem; border-radius: 6px; }
      </style>
  </head>
  <body>
      <h1>Whisper.cpp Server</h1>

-     <h2>/inference</h2>
+     <h2>/inference (synchronous example)</h2>
      <pre>
  curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
  -H "Content-Type: multipart/form-data" \
@@ -756,14 +784,11 @@ int main(int argc, char ** argv) {
  -F model="&lt;path-to-model-file&gt;"
      </pre>

-     <div>
-     <h2>Try it out</h2>
+     <div class="box">
+     <h2>Try it out (synchronous)</h2>
      <form action="/inference" method="POST" enctype="multipart/form-data">
      <label for="file">Choose an audio file:</label>
-     <input type="file" id="file" name="file" accept="audio/*" required><br>
-
-     <label for="temperature">Temperature:</label>
-     <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
+     <input type="file" id="file" name="file" accept="audio/*" required>

      <label for="response_format">Response Format:</label>
      <select id="response_format" name="response_format">
@@ -772,11 +797,120 @@ int main(int argc, char ** argv) {
          <option value="text">Text</option>
          <option value="srt">SRT</option>
          <option value="vtt">VTT</option>
-     </select><br>
+     </select>
+
+     <button type="submit">Run synchronous inference</button>
+     </form>
+     </div>
+
+     <div class="box">
+     <h2>Async example (runnable)</h2>
+     <form id="async-form" enctype="multipart/form-data">
+         <label for="afile">Choose an audio file:</label>
+         <input type="file" id="afile" name="file" accept="audio/*" required>
+
+         <label for="a_response_format">Response Format:</label>
+         <select id="a_response_format" name="response_format">
+             <option value="json">JSON</option>
+             <option value="text">Text</option>
+             <option value="srt">SRT</option>
+             <option value="vtt">VTT</option>
+             <option value="verbose_json">Verbose JSON</option>
+         </select>

-     <button type="submit">Submit</button>
+         <button id="async-submit" type="button">Submit async job</button>
      </form>
+
+     <div>
+         <h3>Task</h3>
+         <div id="task-id">(no task)</div>
+
+         <h3>Result / Status</h3>
+         <pre id="task-result" style="white-space:pre-wrap; background:#f7f7f7; padding:0.5rem; border-radius:4px;"></pre>
+     </div>
      </div>
+
+     <script>
+     (function(){
+         const submitBtn = document.getElementById('async-submit');
+         const form = document.getElementById('async-form');
+         const taskIdEl = document.getElementById('task-id');
+         const resultEl = document.getElementById('task-result');
+         let currentTask = null;
+
+         function sleep(ms){ return new Promise(r=>setTimeout(r, ms)); }
+
+         async function pollTask(id){
+             taskIdEl.textContent = id;
+             resultEl.textContent = 'processing...';
+             while (true) {
+                 try {
+                     const resp = await fetch('/inference_result?id=' + encodeURIComponent(id));
+                     if (resp.status === 404) {
+                         resultEl.textContent = 'task not found';
+                         return;
+                     }
+                     const ctype = resp.headers.get('content-type') || '';
+                     if (ctype.indexOf('application/json') !== -1) {
+                         const j = await resp.json();
+                         // if still processing, continue polling
+                         if (j && j.status && (j.status === 'processing')) {
+                             await sleep(1000);
+                             continue;
+                         }
+                         resultEl.textContent = JSON.stringify(j, null, 2);
+                         return;
+                     } else {
+                         // non-JSON (final text or srt/vtt)
+                         const txt = await resp.text();
+                         if (txt && txt.indexOf('{"status":"processing"') !== -1) {
+                             await sleep(1000);
+                             continue;
+                         }
+                         resultEl.textContent = txt;
+                         return;
+                     }
+                 } catch (err) {
+                     resultEl.textContent = 'error: ' + err.message;
+                     return;
+                 }
+             }
+         }
+
+         submitBtn.addEventListener('click', async function(){
+             resultEl.textContent = '';
+             const fileInput = document.getElementById('afile');
+             if (!fileInput.files || fileInput.files.length === 0) {
+                 alert('Please choose a file');
+                 return;
+             }
+             const fd = new FormData();
+             fd.append('file', fileInput.files[0]);
+             fd.append('response_format', document.getElementById('a_response_format').value);
+
+             submitBtn.disabled = true;
+             submitBtn.textContent = 'Submitting...';
+
+             try {
+                 const resp = await fetch('/inference_async', { method: 'POST', body: fd });
+                 const j = await resp.json();
+                 if (j && j.task_id) {
+                     currentTask = j.task_id;
+                     pollTask(currentTask);
+                 } else {
+                     resultEl.textContent = 'invalid response: ' + JSON.stringify(j);
+                 }
+             } catch (err) {
+                 resultEl.textContent = 'submit error: ' + err.message;
+             } finally {
+                 submitBtn.disabled = false;
+                 submitBtn.textContent = 'Submit async job';
+             }
+         });
+     })();
+     </script>
+
  </body>
  </html>
  )";
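
The form and script above drive the new endpoints from a browser. A command-line sketch of the same submit-then-poll flow, assuming a server on 127.0.0.1:7860, an input file sample.wav, and jq available for JSON field extraction:

  # submit a job; the server replies with {"task_id":"<counter>-<epoch_ms>"}
  TASK_ID=$(curl -s http://127.0.0.1:7860/inference_async \
      -F file=@sample.wav \
      -F response_format=json | jq -r .task_id)

  # poll; a finished task is erased on first read, so keep the fetched body
  while :; do
      BODY=$(curl -s "http://127.0.0.1:7860/inference_result?id=$TASK_ID")
      [ "$BODY" = '{"status":"processing"}' ] || { printf '%s\n' "$BODY"; break; }
      sleep 1
  done
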
@@ -793,323 +927,387 @@ int main(int argc, char ** argv) {
  svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
  });

+ // Helper: run inference for a prepared audio buffer and params, store response in task
+ auto run_inference_task = [&](const std::string & task_id,
+                               whisper_params task_params,
+                               std::vector<float> pcmf32,
+                               std::vector<std::vector<float>> pcmf32s,
+                               const Request * orig_req) {
+     {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[task_id].status = async_status::RUNNING;
+     }
+
+     try {
+         // set up whisper params
+         whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+         wparams.strategy = task_params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
+
+         wparams.print_realtime   = false;
+         wparams.print_progress   = task_params.print_progress;
+         wparams.print_timestamps = !task_params.no_timestamps;
+         wparams.print_special    = task_params.print_special;
+         wparams.translate        = task_params.translate;
+         wparams.language         = task_params.language.c_str();
+         wparams.detect_language  = task_params.detect_language;
+         wparams.n_threads        = task_params.n_threads;
+         wparams.n_max_text_ctx   = task_params.max_context >= 0 ? task_params.max_context : wparams.n_max_text_ctx;
+         wparams.offset_ms        = task_params.offset_t_ms;
+         wparams.duration_ms      = task_params.duration_ms;
+
+         wparams.thold_pt         = task_params.word_thold;
+         wparams.max_len          = task_params.max_len == 0 ? 60 : task_params.max_len;
+         wparams.split_on_word    = task_params.split_on_word;
+         wparams.audio_ctx        = task_params.audio_ctx;
+
+         wparams.debug_mode       = task_params.debug_mode;
+
+         wparams.tdrz_enable      = task_params.tinydiarize; // [TDRZ]
+
+         wparams.initial_prompt   = task_params.prompt.c_str();
+
+         wparams.greedy.best_of        = task_params.best_of;
+         wparams.beam_search.beam_size = task_params.beam_size;
+
+         wparams.temperature      = task_params.temperature;
+         wparams.no_speech_thold  = task_params.no_speech_thold;
+         wparams.temperature_inc  = task_params.temperature_inc;
+         wparams.entropy_thold    = task_params.entropy_thold;
+         wparams.logprob_thold    = task_params.logprob_thold;
+
+         wparams.no_timestamps    = task_params.no_timestamps;
+         wparams.token_timestamps = !task_params.no_timestamps && task_params.response_format == vjson_format;
+         wparams.no_context       = task_params.no_context;
+
+         wparams.suppress_nst     = task_params.suppress_nst;
+
+         wparams.vad              = task_params.vad;
+         wparams.vad_model_path   = task_params.vad_model.c_str();
+
+         wparams.vad_params.threshold               = task_params.vad_threshold;
+         wparams.vad_params.min_speech_duration_ms  = task_params.vad_min_speech_duration_ms;
+         wparams.vad_params.min_silence_duration_ms = task_params.vad_min_silence_duration_ms;
+         wparams.vad_params.max_speech_duration_s   = task_params.vad_max_speech_duration_s;
+         wparams.vad_params.speech_pad_ms           = task_params.vad_speech_pad_ms;
+         wparams.vad_params.samples_overlap         = task_params.vad_samples_overlap;
+
+         whisper_print_user_data user_data = { &task_params, &pcmf32s, 0 };
+
+         if (task_params.print_realtime) {
+             wparams.new_segment_callback           = whisper_print_segment_callback;
+             wparams.new_segment_callback_user_data = &user_data;
+         }
+
+         if (wparams.print_progress) {
+             wparams.progress_callback           = whisper_print_progress_callback;
+             wparams.progress_callback_user_data = &user_data;
+         }
+
+         // abort callback uses original request pointer if provided
+         // ggml_abort_callback expects a function returning bool
+         wparams.abort_callback = [](void * user_data) -> bool {
+             if (!user_data) return false;
+             auto req_ptr = static_cast<const httplib::Request *>(user_data);
+             return req_ptr->is_connection_closed();
+         };
+         wparams.abort_callback_user_data = (void *) orig_req;
+
+         if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), task_params.n_processors) != 0) {
+             // failure
+             std::lock_guard<std::mutex> tlock(tasks_mutex);
+             tasks[task_id].status = async_status::FAILED;
+             tasks[task_id].error  = "failed to process audio";
+             tasks[task_id].result = "";
+             return;
+         }
+
+         // prepare response according to format
+         std::string content;
+         std::string ctype = "application/json";
+         if (task_params.response_format == text_format) {
+             content = output_str(ctx, task_params, pcmf32s);
+             ctype = "text/plain; charset=utf-8";
+         } else if (task_params.response_format == srt_format) {
+             std::stringstream ss;
+             const int n_segments = whisper_full_n_segments(ctx);
+             for (int i = 0; i < n_segments; ++i) {
+                 const char * text = whisper_full_get_segment_text(ctx, i);
+                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+                 std::string speaker = "";
+                 if (task_params.diarize && pcmf32s.size() == 2) {
+                     speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
+                 }
+                 ss << i + 1 + task_params.offset_n << "\n";
+                 ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
+                 ss << speaker << text << "\n\n";
+             }
+             content = ss.str();
+             ctype = "application/x-subrip";
+         } else if (task_params.response_format == vtt_format) {
+             std::stringstream ss;
+             ss << "WEBVTT\n\n";
+             const int n_segments = whisper_full_n_segments(ctx);
+             for (int i = 0; i < n_segments; ++i) {
+                 const char * text = whisper_full_get_segment_text(ctx, i);
+                 const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                 const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+                 std::string speaker = "";
+                 if (task_params.diarize && pcmf32s.size() == 2) {
+                     speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
+                     speaker.insert(0, "<v Speaker");
+                     speaker.append(">");
+                 }
+                 ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
+                 ss << speaker << text << "\n\n";
+             }
+             content = ss.str();
+             ctype = "text/vtt";
+         } else if (task_params.response_format == vjson_format) {
+             std::string results = output_str(ctx, task_params, pcmf32s);
+             json jres = json{
+                 {"task", task_params.translate ? "translate" : "transcribe"},
+                 {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
+                 {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
+                 {"text", results},
+                 {"segments", json::array()}
+             };
+             if (!task_params.no_language_probabilities) {
+                 std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
+                 const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, task_params.n_threads, lang_probs.data());
+                 jres["detected_language"]             = whisper_lang_str_full(detected_lang_id);
+                 jres["detected_language_probability"] = lang_probs[detected_lang_id];
+                 jres["language_probabilities"]        = json::object();
+                 for (int i = 0; i <= whisper_lang_max_id(); ++i) {
+                     if (lang_probs[i] > 0.001f) {
+                         jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
+                     }
+                 }
+             }
+             const int n_segments = whisper_full_n_segments(ctx);
+             for (int i = 0; i < n_segments; ++i) {
+                 json segment = json{{"id", i}, {"text", whisper_full_get_segment_text(ctx, i)}};
+                 if (!task_params.no_timestamps) {
+                     segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
+                     segment["end"]   = whisper_full_get_segment_t1(ctx, i) * 0.01;
+                 }
+                 float total_logprob = 0;
+                 const int n_tokens = whisper_full_n_tokens(ctx, i);
+                 for (int j = 0; j < n_tokens; ++j) {
+                     whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
+                     if (token.id >= whisper_token_eot(ctx)) continue;
+                     segment["tokens"].push_back(token.id);
+                     json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
+                     if (!task_params.no_timestamps) {
+                         word["start"] = token.t0 * 0.01;
+                         word["end"]   = token.t1 * 0.01;
+                         word["t_dtw"] = token.t_dtw;
+                     }
+                     word["probability"] = token.p;
+                     total_logprob += token.plog;
+                     segment["words"].push_back(word);
+                 }
+                 segment["temperature"]    = task_params.temperature;
+                 segment["avg_logprob"]    = total_logprob / n_tokens;
+                 segment["no_speech_prob"] = whisper_full_get_segment_no_speech_prob(ctx, i);
+                 jres["segments"].push_back(segment);
+             }
+             content = jres.dump(-1, ' ', false, json::error_handler_t::replace);
+             ctype = "application/json";
+         } else {
+             std::string results = output_str(ctx, task_params, pcmf32s);
+             json jres = json{{"text", results}};
+             content = jres.dump(-1, ' ', false, json::error_handler_t::replace);
+             ctype = "application/json";
+         }
+
+         // store result
+         {
+             std::lock_guard<std::mutex> tlock(tasks_mutex);
+             tasks[task_id].status       = async_status::FINISHED;
+             tasks[task_id].result       = content;
+             tasks[task_id].content_type = ctype;
+         }
+
+     } catch (const std::exception & e) {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[task_id].status = async_status::FAILED;
+         tasks[task_id].error  = e.what();
+         tasks[task_id].result.clear();
+     }
+ };
+
+ // Synchronous inference kept for compatibility at the original path
  svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
-     // acquire whisper model mutex lock
+     // existing synchronous behavior: simply call the async helper synchronously while holding the mutex
      std::lock_guard<std::mutex> lock(whisper_mutex);

-     // first check user requested fields of the request
-     if (!req.has_file("file"))
-     {
-         fprintf(stderr, "error: no 'file' field in the request\n");
+     if (!req.has_file("file")) {
          const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
          res.set_content(error_resp, "application/json");
          return;
      }
      auto audio_file = req.get_file_value("file");
-
-     // check non-required fields
+     // gather parameters
      get_req_parameters(req, params);

-     std::string filename{audio_file.filename};
-     printf("Received request: %s\n", filename.c_str());
-
-     // audio arrays
-     std::vector<float> pcmf32;               // mono-channel F32 PCM
-     std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
-
+     std::vector<float> pcmf32;
+     std::vector<std::vector<float>> pcmf32s;
      if (sparams.ffmpeg_converter) {
-         // if file is not wav, convert to wav
-         // write to temporary file
          const std::string temp_filename = generate_temp_filename("whisper-server", ".wav");
          std::ofstream temp_file{temp_filename, std::ios::binary};
          temp_file << audio_file.content;
          temp_file.close();
-
-         std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
-         const bool is_converted = convert_to_wav(temp_filename, error_resp);
-         if (!is_converted) {
+         std::string error_resp;
+         if (!convert_to_wav(temp_filename, error_resp)) {
             res.set_content(error_resp, "application/json");
             return;
         }
-
-         // read audio content into pcmf32
-         if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, params.diarize))
-         {
-             fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
+         if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, params.diarize)) {
             const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
             res.set_content(error_resp, "application/json");
             std::remove(temp_filename.c_str());
             return;
         }
-         // remove temp file
         std::remove(temp_filename.c_str());
     } else {
-         if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
-         {
-             fprintf(stderr, "error: failed to read audio data\n");
+         if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize)) {
             const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
             res.set_content(error_resp, "application/json");
             return;
         }
     }

-     printf("Successfully loaded %s\n", filename.c_str());
-
-     // print system information
+     // create a temporary task id to run synchronously
+     const std::string tmp_task_id = generate_task_id();
     {
-         fprintf(stderr, "\n");
-         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                 params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[tmp_task_id] = async_task_t();
+         tasks[tmp_task_id].status = async_status::PENDING;
     }

-     // print some info about the processing
+     // run in same thread
+     run_inference_task(tmp_task_id, params, std::move(pcmf32), std::move(pcmf32s), &req);
+
+     // return the stored result
     {
-         fprintf(stderr, "\n");
-         if (!whisper_is_multilingual(ctx)) {
-             if (params.language != "en" || params.translate) {
-                 params.language = "en";
-                 params.translate = false;
-                 fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-             }
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         if (tasks[tmp_task_id].status == async_status::FINISHED) {
+             res.set_content(tasks[tmp_task_id].result, tasks[tmp_task_id].content_type);
+         } else if (tasks[tmp_task_id].status == async_status::FAILED) {
+             const std::string err = tasks[tmp_task_id].error.empty() ? "{\"error\":\"failed\"}" : tasks[tmp_task_id].error;
+             res.set_content(err, "application/json");
+         } else {
+             res.set_content("{\"status\":\"processing\"}", "application/json");
         }
-         if (params.detect_language) {
-             params.language = "auto";
-         }
-         fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
-                 __func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                 params.n_threads, params.n_processors,
-                 params.language.c_str(),
-                 params.translate ? "translate" : "transcribe",
-                 params.tinydiarize ? "tdrz = 1, " : "",
-                 params.no_timestamps ? 0 : 1);
-
-         fprintf(stderr, "\n");
+         tasks.erase(tmp_task_id);
     }
+ });

-     // run the inference
-     {
-         printf("Running whisper.cpp inference on %s\n", filename.c_str());
-         whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-         wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
-
-         wparams.print_realtime   = false;
-         wparams.print_progress   = params.print_progress;
-         wparams.print_timestamps = !params.no_timestamps;
-         wparams.print_special    = params.print_special;
-         wparams.translate        = params.translate;
-         wparams.language         = params.language.c_str();
-         wparams.detect_language  = params.detect_language;
-         wparams.n_threads        = params.n_threads;
-         wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
-         wparams.offset_ms        = params.offset_t_ms;
-         wparams.duration_ms      = params.duration_ms;
-
-         wparams.thold_pt         = params.word_thold;
-         wparams.max_len          = params.max_len == 0 ? 60 : params.max_len;
-         wparams.split_on_word    = params.split_on_word;
-         wparams.audio_ctx        = params.audio_ctx;
-
-         wparams.debug_mode       = params.debug_mode;
-
-         wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
-
-         wparams.initial_prompt   = params.prompt.c_str();
-
-         wparams.greedy.best_of        = params.best_of;
-         wparams.beam_search.beam_size = params.beam_size;
-
-         wparams.temperature      = params.temperature;
-         wparams.no_speech_thold  = params.no_speech_thold;
-         wparams.temperature_inc  = params.temperature_inc;
-         wparams.entropy_thold    = params.entropy_thold;
-         wparams.logprob_thold    = params.logprob_thold;
-
-         wparams.no_timestamps    = params.no_timestamps;
-         wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
-         wparams.no_context       = params.no_context;
-
-         wparams.suppress_nst     = params.suppress_nst;
-
-         wparams.vad              = params.vad;
-         wparams.vad_model_path   = params.vad_model.c_str();
-
-         wparams.vad_params.threshold               = params.vad_threshold;
-         wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
-         wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
-         wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
-         wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
-         wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
-
-         whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
-
-         // this callback is called on each new segment
-         if (params.print_realtime) {
-             wparams.new_segment_callback           = whisper_print_segment_callback;
-             wparams.new_segment_callback_user_data = &user_data;
-         }
-
-         if (wparams.print_progress) {
-             wparams.progress_callback           = whisper_print_progress_callback;
-             wparams.progress_callback_user_data = &user_data;
-         }
-
-         // tell whisper to abort if the HTTP connection closed
-         wparams.abort_callback = [](void * user_data) {
-             // user_data is a pointer to our Request
-             auto req_ptr = static_cast<const httplib::Request *>(user_data);
-             return req_ptr->is_connection_closed();
-         };
-         wparams.abort_callback_user_data = (void *) &req;
-
-         if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
-             // handle failure or early abort
-             if (req.is_connection_closed()) {
-                 // log client disconnect
-                 fprintf(stderr, "client disconnected, aborted processing\n");
-                 res.status = 499; // Client Closed Request (nginx convention)
-                 res.set_content("{\"error\":\"client disconnected\"}", "application/json");
-                 return;
-             }
-             fprintf(stderr, "%s: failed to process audio\n", argv[0]);
-             res.status = 500; // Internal Server Error
-             const std::string error_resp = "{\"error\":\"failed to process audio\"}";
-             res.set_content(error_resp, "application/json");
-             return;
-         }
-     }
-
-     // return results to user
-     if (params.response_format == text_format)
-     {
-         std::string results = output_str(ctx, params, pcmf32s);
-         res.set_content(results.c_str(), "text/html; charset=utf-8");
-     }
-     else if (params.response_format == srt_format)
-     {
-         std::stringstream ss;
-         const int n_segments = whisper_full_n_segments(ctx);
-         for (int i = 0; i < n_segments; ++i) {
-             const char * text = whisper_full_get_segment_text(ctx, i);
-             const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-             const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-             std::string speaker = "";
-
-             if (params.diarize && pcmf32s.size() == 2)
-             {
-                 speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-             }
-
-             ss << i + 1 + params.offset_n << "\n";
-             ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
-             ss << speaker << text << "\n\n";
-         }
-         res.set_content(ss.str(), "application/x-subrip");
-     } else if (params.response_format == vtt_format) {
-         std::stringstream ss;
-
-         ss << "WEBVTT\n\n";
-
-         const int n_segments = whisper_full_n_segments(ctx);
-         for (int i = 0; i < n_segments; ++i) {
-             const char * text = whisper_full_get_segment_text(ctx, i);
-             const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-             const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-             std::string speaker = "";
-
-             if (params.diarize && pcmf32s.size() == 2)
-             {
-                 speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
-                 speaker.insert(0, "<v Speaker");
-                 speaker.append(">");
-             }
-
-             ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
-             ss << speaker << text << "\n\n";
-         }
-         res.set_content(ss.str(), "text/vtt");
-     } else if (params.response_format == vjson_format) {
-         /* try to match openai/whisper's Python format */
-         std::string results = output_str(ctx, params, pcmf32s);
-         json jres = json{
-             {"task", params.translate ? "translate" : "transcribe"},
-             {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
-             {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
-             {"text", results},
-             {"segments", json::array()}
-         };
-         // Only compute language probabilities if requested (expensive operation)
-         if (!params.no_language_probabilities) {
-             std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
-             const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
-             jres["detected_language"]             = whisper_lang_str_full(detected_lang_id);
-             jres["detected_language_probability"] = lang_probs[detected_lang_id];
-             jres["language_probabilities"]        = json::object();
-             // Add all language probabilities
-             for (int i = 0; i <= whisper_lang_max_id(); ++i) {
-                 if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
-                     jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
-                 }
-             }
-         }
-         const int n_segments = whisper_full_n_segments(ctx);
-         for (int i = 0; i < n_segments; ++i)
-         {
-             json segment = json{
-                 {"id", i},
-                 {"text", whisper_full_get_segment_text(ctx, i)},
-             };
-
-             if (!params.no_timestamps) {
-                 segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
-                 segment["end"]   = whisper_full_get_segment_t1(ctx, i) * 0.01;
-             }
-
-             float total_logprob = 0;
-             const int n_tokens = whisper_full_n_tokens(ctx, i);
-             for (int j = 0; j < n_tokens; ++j) {
-                 whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
-                 if (token.id >= whisper_token_eot(ctx)) {
-                     continue;
-                 }
-
-                 segment["tokens"].push_back(token.id);
-                 json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
-                 if (!params.no_timestamps) {
-                     word["start"] = token.t0 * 0.01;
-                     word["end"]   = token.t1 * 0.01;
-                     word["t_dtw"] = token.t_dtw;
-                 }
-                 word["probability"] = token.p;
-                 total_logprob += token.plog;
-                 segment["words"].push_back(word);
-             }
-
-             segment["temperature"] = params.temperature;
-             segment["avg_logprob"] = total_logprob / n_tokens;
-
-             // TODO compression_ratio and no_speech_prob are not implemented yet
-             // segment["compression_ratio"] = 0;
-             segment["no_speech_prob"] = whisper_full_get_segment_no_speech_prob(ctx, i);
-
-             jres["segments"].push_back(segment);
-         }
-         res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
-                         "application/json");
-     }
-     // TODO add more output formats
-     else
-     {
-         std::string results = output_str(ctx, params, pcmf32s);
-         json jres = json{
-             {"text", results}
-         };
-         res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
-                         "application/json");
-     }
-
-     // reset params to their defaults
-     params = default_params;
+ // POST /inference_async -> enqueue background task and return task id
+ svr->Post(sparams.request_path + "/inference_async", [&](const Request &req, Response &res){
+     if (!req.has_file("file")) {
+         const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
+         res.set_content(error_resp, "application/json");
+         return;
+     }
+
+     // prepare params and audio buffers without holding model mutex for long
+     whisper_params task_params = params; // copy default base
+     get_req_parameters(req, task_params);
+
+     auto audio_file = req.get_file_value("file");
+
+     std::vector<float> pcmf32;
+     std::vector<std::vector<float>> pcmf32s;
+
+     if (sparams.ffmpeg_converter) {
+         const std::string temp_filename = generate_temp_filename("whisper-server", ".wav");
+         std::ofstream temp_file{temp_filename, std::ios::binary};
+         temp_file << audio_file.content;
+         temp_file.close();
+         std::string error_resp;
+         if (!convert_to_wav(temp_filename, error_resp)) {
+             res.set_content(error_resp, "application/json");
+             return;
+         }
+         if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, task_params.diarize)) {
+             const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+             res.set_content(error_resp, "application/json");
+             std::remove(temp_filename.c_str());
+             return;
+         }
+         std::remove(temp_filename.c_str());
+     } else {
+         if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, task_params.diarize)) {
+             const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
+             res.set_content(error_resp, "application/json");
+             return;
+         }
+     }
+
+     // create task id and store placeholder
+     const std::string task_id = generate_task_id();
+     {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         tasks[task_id] = async_task_t();
+         tasks[task_id].status = async_status::PENDING;
+     }
+
+     // spawn background worker thread
+     std::thread worker([&, task_id, task_params, pcmf32 = std::move(pcmf32), pcmf32s = std::move(pcmf32s)]() mutable {
+         // ensure only one inference runs at a time
+         std::lock_guard<std::mutex> lock(whisper_mutex);
+         // do not pass a pointer to the Request object into the background thread - it will be out of scope
+         run_inference_task(task_id, task_params, std::move(pcmf32), std::move(pcmf32s), nullptr);
+     });
+     worker.detach();
+
+     json j = json{{"task_id", task_id}};
+     res.set_content(j.dump(), "application/json");
+ });
+
+ // GET /inference_result?id=<task_id> -> return status/result
+ svr->Get(sparams.request_path + "/inference_result", [&](const Request &req, Response &res){
+     if (!req.has_param("id")) {
+         res.set_content("{\"error\":\"missing id parameter\"}", "application/json");
+         return;
+     }
+     const std::string id = req.get_param_value("id");
+     // copy needed data while holding lock, then release lock and send response
+     std::string out_body;
+     std::string out_ctype;
+     {
+         std::lock_guard<std::mutex> tlock(tasks_mutex);
+         auto it = tasks.find(id);
+         if (it == tasks.end()) {
+             res.set_content("{\"error\":\"task not found\"}", "application/json");
+             res.status = 404;
+             return;
+         }
+
+         const auto & t = it->second;
+         if (t.status == async_status::PENDING || t.status == async_status::RUNNING) {
+             json j = { {"status", "processing"} };
+             res.set_content(j.dump(), "application/json");
+             return;
+         }
+         if (t.status == async_status::FAILED) {
+             json j = { {"status", "failed"}, {"error", t.error} };
+             out_body  = j.dump();
+             out_ctype = "application/json";
+             // remove failed task from map to avoid accumulation
+             tasks.erase(it);
+         } else {
+             // FINISHED: copy and erase the task so it's cleaned up after retrieval
+             out_body  = t.result;
+             out_ctype = t.content_type;
+             tasks.erase(it);
+         }
+     }
+
+     res.set_content(out_body, out_ctype);
  });
  svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
      std::lock_guard<std::mutex> lock(whisper_mutex);
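
A note on the retrieval semantics implemented above: /inference_result is one-shot. Pending and running tasks report processing; failed and finished tasks are erased from the map as they are read, so a second request for the same id returns 404. A sketch of the edge cases, with an assumed host/port:

  curl -s 'http://127.0.0.1:7860/inference_result'
  # -> {"error":"missing id parameter"}

  curl -si 'http://127.0.0.1:7860/inference_result?id=unknown-id' | head -n 1
  # -> HTTP/1.1 404 Not Found    (body: {"error":"task not found"})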