JacobLinCool committed on
Commit
763d09d
·
unverified ·
1 Parent(s): 587152f

server : add fields to `verbose_json` response (#1802)

Browse files

* server: include additional fields in the verbose_json response as OpenAI does

* server: show request examples on home page

* server: todo note for compression_ratio and no_speech_prob

* server: add simple demo form to the homepage

Files changed (1) hide show
  1. examples/server/server.cpp +87 -2
examples/server/server.cpp CHANGED
@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
543
  {"Access-Control-Allow-Origin", "*"},
544
  {"Access-Control-Allow-Headers", "content-type"}});
545
 
546
- std::string const default_content = "<html>hello</html>";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
547
 
548
  // store default params so we can reset after each inference request
549
  whisper_params default_params = params;
@@ -787,7 +856,13 @@ int main(int argc, char ** argv) {
787
  } else if (params.response_format == vjson_format) {
788
  /* try to match openai/whisper's Python format */
789
  std::string results = output_str(ctx, params, pcmf32s);
790
- json jres = json{{"text", results}};
 
 
 
 
 
 
791
  const int n_segments = whisper_full_n_segments(ctx);
792
  for (int i = 0; i < n_segments; ++i)
793
  {
@@ -801,6 +876,7 @@ int main(int argc, char ** argv) {
801
  segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
802
  }
803
 
 
804
  const int n_tokens = whisper_full_n_tokens(ctx, i);
805
  for (int j = 0; j < n_tokens; ++j) {
806
  whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
@@ -815,8 +891,17 @@ int main(int argc, char ** argv) {
815
  word["end"] = token.t1 * 0.01;
816
  }
817
  word["probability"] = token.p;
 
818
  segment["words"].push_back(word);
819
  }
 
 
 
 
 
 
 
 
820
  jres["segments"].push_back(segment);
821
  }
822
  res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
 
543
  {"Access-Control-Allow-Origin", "*"},
544
  {"Access-Control-Allow-Headers", "content-type"}});
545
 
546
+ std::string const default_content = R"(
547
+ <html>
548
+ <head>
549
+ <title>Whisper.cpp Server</title>
550
+ <meta charset="utf-8">
551
+ <meta name="viewport" content="width=device-width">
552
+ <style>
553
+ body {
554
+ font-family: sans-serif;
555
+ }
556
+ form {
557
+ display: flex;
558
+ flex-direction: column;
559
+ align-items: flex-start;
560
+ }
561
+ label {
562
+ margin-bottom: 0.5rem;
563
+ }
564
+ input, select {
565
+ margin-bottom: 1rem;
566
+ }
567
+ button {
568
+ margin-top: 1rem;
569
+ }
570
+ </style>
571
+ </head>
572
+ <body>
573
+ <h1>Whisper.cpp Server</h1>
574
+
575
+ <h2>/inference</h2>
576
+ <pre>
577
+ curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
578
+ -H "Content-Type: multipart/form-data" \
579
+ -F file="@&lt;file-path&gt;" \
580
+ -F temperature="0.0" \
581
+ -F temperature_inc="0.2" \
582
+ -F response_format="json"
583
+ </pre>
584
+
585
+ <h2>/load</h2>
586
+ <pre>
587
+ curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
588
+ -H "Content-Type: multipart/form-data" \
589
+ -F model="&lt;path-to-model-file&gt;"
590
+ </pre>
591
+
592
+ <div>
593
+ <h2>Try it out</h2>
594
+ <form action="/inference" method="POST" enctype="multipart/form-data">
595
+ <label for="file">Choose an audio file:</label>
596
+ <input type="file" id="file" name="file" accept="audio/*" required><br>
597
+
598
+ <label for="temperature">Temperature:</label>
599
+ <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
600
+
601
+ <label for="response_format">Response Format:</label>
602
+ <select id="response_format" name="response_format">
603
+ <option value="verbose_json">Verbose JSON</option>
604
+ <option value="json">JSON</option>
605
+ <option value="text">Text</option>
606
+ <option value="srt">SRT</option>
607
+ <option value="vtt">VTT</option>
608
+ </select><br>
609
+
610
+ <button type="submit">Submit</button>
611
+ </form>
612
+ </div>
613
+ </body>
614
+ </html>
615
+ )";
616
 
617
  // store default params so we can reset after each inference request
618
  whisper_params default_params = params;
 
856
  } else if (params.response_format == vjson_format) {
857
  /* try to match openai/whisper's Python format */
858
  std::string results = output_str(ctx, params, pcmf32s);
859
+ json jres = json{
860
+ {"task", params.translate ? "translate" : "transcribe"},
861
+ {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
862
+ {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
863
+ {"text", results},
864
+ {"segments", json::array()}
865
+ };
866
  const int n_segments = whisper_full_n_segments(ctx);
867
  for (int i = 0; i < n_segments; ++i)
868
  {
 
876
  segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
877
  }
878
 
879
+ float total_logprob = 0;
880
  const int n_tokens = whisper_full_n_tokens(ctx, i);
881
  for (int j = 0; j < n_tokens; ++j) {
882
  whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
 
891
  word["end"] = token.t1 * 0.01;
892
  }
893
  word["probability"] = token.p;
894
+ total_logprob += token.plog;
895
  segment["words"].push_back(word);
896
  }
897
+
898
+ segment["temperature"] = params.temperature;
899
+ segment["avg_logprob"] = total_logprob / n_tokens;
900
+
901
+ // TODO compression_ratio and no_speech_prob are not implemented yet
902
+ // segment["compression_ratio"] = 0;
903
+ // segment["no_speech_prob"] = 0;
904
+
905
  jres["segments"].push_back(segment);
906
  }
907
  res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),