Spaces:
Running
Running
server : add fields to `verbose_json` response (#1802)
Browse files
* server: include additional fields in the verbose_json response, as OpenAI does
* server: show request examples on home page
* server: todo note for compression_ratio and no_speech_prob
* server: add simple demo form to the homepage
- examples/server/server.cpp +87 -2
examples/server/server.cpp
CHANGED
|
@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
|
|
| 543 |
{"Access-Control-Allow-Origin", "*"},
|
| 544 |
{"Access-Control-Allow-Headers", "content-type"}});
|
| 545 |
|
| 546 |
-
std::string const default_content = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 547 |
|
| 548 |
// store default params so we can reset after each inference request
|
| 549 |
whisper_params default_params = params;
|
|
@@ -787,7 +856,13 @@ int main(int argc, char ** argv) {
|
|
| 787 |
} else if (params.response_format == vjson_format) {
|
| 788 |
/* try to match openai/whisper's Python format */
|
| 789 |
std::string results = output_str(ctx, params, pcmf32s);
|
| 790 |
-
json jres = json{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 791 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 792 |
for (int i = 0; i < n_segments; ++i)
|
| 793 |
{
|
|
@@ -801,6 +876,7 @@ int main(int argc, char ** argv) {
|
|
| 801 |
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
|
| 802 |
}
|
| 803 |
|
|
|
|
| 804 |
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
| 805 |
for (int j = 0; j < n_tokens; ++j) {
|
| 806 |
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
|
@@ -815,8 +891,17 @@ int main(int argc, char ** argv) {
|
|
| 815 |
word["end"] = token.t1 * 0.01;
|
| 816 |
}
|
| 817 |
word["probability"] = token.p;
|
|
|
|
| 818 |
segment["words"].push_back(word);
|
| 819 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
jres["segments"].push_back(segment);
|
| 821 |
}
|
| 822 |
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
|
|
|
|
| 543 |
{"Access-Control-Allow-Origin", "*"},
|
| 544 |
{"Access-Control-Allow-Headers", "content-type"}});
|
| 545 |
|
| 546 |
+
std::string const default_content = R"(
|
| 547 |
+
<html>
|
| 548 |
+
<head>
|
| 549 |
+
<title>Whisper.cpp Server</title>
|
| 550 |
+
<meta charset="utf-8">
|
| 551 |
+
<meta name="viewport" content="width=device-width">
|
| 552 |
+
<style>
|
| 553 |
+
body {
|
| 554 |
+
font-family: sans-serif;
|
| 555 |
+
}
|
| 556 |
+
form {
|
| 557 |
+
display: flex;
|
| 558 |
+
flex-direction: column;
|
| 559 |
+
align-items: flex-start;
|
| 560 |
+
}
|
| 561 |
+
label {
|
| 562 |
+
margin-bottom: 0.5rem;
|
| 563 |
+
}
|
| 564 |
+
input, select {
|
| 565 |
+
margin-bottom: 1rem;
|
| 566 |
+
}
|
| 567 |
+
button {
|
| 568 |
+
margin-top: 1rem;
|
| 569 |
+
}
|
| 570 |
+
</style>
|
| 571 |
+
</head>
|
| 572 |
+
<body>
|
| 573 |
+
<h1>Whisper.cpp Server</h1>
|
| 574 |
+
|
| 575 |
+
<h2>/inference</h2>
|
| 576 |
+
<pre>
|
| 577 |
+
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
|
| 578 |
+
-H "Content-Type: multipart/form-data" \
|
| 579 |
+
-F file="@<file-path>" \
|
| 580 |
+
-F temperature="0.0" \
|
| 581 |
+
-F temperature_inc="0.2" \
|
| 582 |
+
-F response_format="json"
|
| 583 |
+
</pre>
|
| 584 |
+
|
| 585 |
+
<h2>/load</h2>
|
| 586 |
+
<pre>
|
| 587 |
+
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
|
| 588 |
+
-H "Content-Type: multipart/form-data" \
|
| 589 |
+
-F model="<path-to-model-file>"
|
| 590 |
+
</pre>
|
| 591 |
+
|
| 592 |
+
<div>
|
| 593 |
+
<h2>Try it out</h2>
|
| 594 |
+
<form action="/inference" method="POST" enctype="multipart/form-data">
|
| 595 |
+
<label for="file">Choose an audio file:</label>
|
| 596 |
+
<input type="file" id="file" name="file" accept="audio/*" required><br>
|
| 597 |
+
|
| 598 |
+
<label for="temperature">Temperature:</label>
|
| 599 |
+
<input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
|
| 600 |
+
|
| 601 |
+
<label for="response_format">Response Format:</label>
|
| 602 |
+
<select id="response_format" name="response_format">
|
| 603 |
+
<option value="verbose_json">Verbose JSON</option>
|
| 604 |
+
<option value="json">JSON</option>
|
| 605 |
+
<option value="text">Text</option>
|
| 606 |
+
<option value="srt">SRT</option>
|
| 607 |
+
<option value="vtt">VTT</option>
|
| 608 |
+
</select><br>
|
| 609 |
+
|
| 610 |
+
<button type="submit">Submit</button>
|
| 611 |
+
</form>
|
| 612 |
+
</div>
|
| 613 |
+
</body>
|
| 614 |
+
</html>
|
| 615 |
+
)";
|
| 616 |
|
| 617 |
// store default params so we can reset after each inference request
|
| 618 |
whisper_params default_params = params;
|
|
|
|
| 856 |
} else if (params.response_format == vjson_format) {
|
| 857 |
/* try to match openai/whisper's Python format */
|
| 858 |
std::string results = output_str(ctx, params, pcmf32s);
|
| 859 |
+
json jres = json{
|
| 860 |
+
{"task", params.translate ? "translate" : "transcribe"},
|
| 861 |
+
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
|
| 862 |
+
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
|
| 863 |
+
{"text", results},
|
| 864 |
+
{"segments", json::array()}
|
| 865 |
+
};
|
| 866 |
const int n_segments = whisper_full_n_segments(ctx);
|
| 867 |
for (int i = 0; i < n_segments; ++i)
|
| 868 |
{
|
|
|
|
| 876 |
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
|
| 877 |
}
|
| 878 |
|
| 879 |
+
float total_logprob = 0;
|
| 880 |
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
| 881 |
for (int j = 0; j < n_tokens; ++j) {
|
| 882 |
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
|
|
|
|
| 891 |
word["end"] = token.t1 * 0.01;
|
| 892 |
}
|
| 893 |
word["probability"] = token.p;
|
| 894 |
+
total_logprob += token.plog;
|
| 895 |
segment["words"].push_back(word);
|
| 896 |
}
|
| 897 |
+
|
| 898 |
+
segment["temperature"] = params.temperature;
|
| 899 |
+
segment["avg_logprob"] = total_logprob / n_tokens;
|
| 900 |
+
|
| 901 |
+
// TODO compression_ratio and no_speech_prob are not implemented yet
|
| 902 |
+
// segment["compression_ratio"] = 0;
|
| 903 |
+
// segment["no_speech_prob"] = 0;
|
| 904 |
+
|
| 905 |
jres["segments"].push_back(segment);
|
| 906 |
}
|
| 907 |
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
|