ggerganov committed on
Commit
ce2cd6d
·
1 Parent(s): fd68d89

ref #9 : add API documentation in whisper.h

Browse files
Files changed (2) hide show
  1. main.cpp +4 -4
  2. whisper.h +69 -5
main.cpp CHANGED
@@ -230,25 +230,25 @@ int main(int argc, char ** argv) {
230
 
231
  // print result
232
  if (!wparams.print_realtime) {
233
- fprintf(stderr, "\n");
234
 
235
  const int n_segments = whisper_full_n_segments(ctx);
236
  for (int i = 0; i < n_segments; ++i) {
237
  const char * text = whisper_full_get_segment_text(ctx, i);
238
 
239
  if (params.no_timestamps) {
240
- fprintf(stderr, "%s", text);
241
  fflush(stdout);
242
  } else {
243
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
244
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
245
 
246
- fprintf(stderr, "[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
247
  }
248
  }
249
  }
250
 
251
- fprintf(stderr, "\n");
252
 
253
  // output to text file
254
  if (params.output_txt) {
 
230
 
231
  // print result
232
  if (!wparams.print_realtime) {
233
+ printf("\n");
234
 
235
  const int n_segments = whisper_full_n_segments(ctx);
236
  for (int i = 0; i < n_segments; ++i) {
237
  const char * text = whisper_full_get_segment_text(ctx, i);
238
 
239
  if (params.no_timestamps) {
240
+ printf("%s", text);
241
  fflush(stdout);
242
  } else {
243
  const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
244
  const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
245
 
246
+ printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
247
  }
248
  }
249
  }
250
 
251
+ printf("\n");
252
 
253
  // output to text file
254
  if (params.output_txt) {
whisper.h CHANGED
@@ -31,33 +31,81 @@ extern "C" {
31
  // C interface
32
  //
33
 
34
- // TODO: documentation will come soon
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  struct whisper_context;
37
 
38
  typedef int whisper_token;
39
 
 
 
40
  WHISPER_API struct whisper_context * whisper_init(const char * path_model);
 
 
41
  WHISPER_API void whisper_free(struct whisper_context * ctx);
42
 
 
 
 
43
  WHISPER_API int whisper_pcm_to_mel(
44
  struct whisper_context * ctx,
45
  const float * samples,
46
  int n_samples,
47
  int n_threads);
48
 
 
 
49
  // n_mel must be 80
 
50
  WHISPER_API int whisper_set_mel(
51
  struct whisper_context * ctx,
52
  const float * data,
53
  int n_len,
54
  int n_mel);
55
 
 
 
 
 
56
  WHISPER_API int whisper_encode(
57
  struct whisper_context * ctx,
58
  int offset,
59
  int n_threads);
60
 
 
 
 
 
 
61
  WHISPER_API int whisper_decode(
62
  struct whisper_context * ctx,
63
  const whisper_token * tokens,
@@ -65,10 +113,15 @@ extern "C" {
65
  int n_past,
66
  int n_threads);
67
 
 
 
 
 
 
68
  WHISPER_API whisper_token whisper_sample_best(struct whisper_context * ctx, bool need_timestamp);
69
  WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
70
 
71
- // return the id of the specified language, returns -1 if not found
72
  WHISPER_API int whisper_lang_id(const char * lang);
73
 
74
  WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
@@ -76,10 +129,13 @@ extern "C" {
76
  WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
77
  WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
78
 
 
79
  WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
80
 
 
81
  WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
82
 
 
83
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
84
  WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
85
  WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
@@ -87,16 +143,19 @@ extern "C" {
87
  WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
88
  WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
89
 
 
90
  WHISPER_API whisper_token whisper_token_translate ();
91
  WHISPER_API whisper_token whisper_token_transcribe();
92
 
 
93
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
94
 
95
  ////////////////////////////////////////////////////////////////////////////
96
 
 
97
  enum whisper_decode_strategy {
98
- WHISPER_DECODE_GREEDY,
99
- WHISPER_DECODE_BEAM_SEARCH,
100
  };
101
 
102
  struct whisper_full_params {
@@ -129,18 +188,23 @@ extern "C" {
129
 
130
  WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_decode_strategy strategy);
131
 
132
- // full whisper run - encode + decode
 
133
  WHISPER_API int whisper_full(
134
  struct whisper_context * ctx,
135
  struct whisper_full_params params,
136
  const float * samples,
137
  int n_samples);
138
 
 
 
139
  WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
140
 
 
141
  WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
142
  WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
143
 
 
144
  WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
145
 
146
  #ifdef __cplusplus
 
31
  // C interface
32
  //
33
 
34
+ //
35
+ // Basic usage:
36
+ //
37
+ // #include "whisper.h"
38
+ //
39
+ // ...
40
+ //
41
+ // struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
42
+ //
43
+ // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
44
+ // fprintf(stderr, "failed to process audio\n");
45
+ // return 7;
46
+ // }
47
+ //
48
+ // const int n_segments = whisper_full_n_segments(ctx);
49
+ // for (int i = 0; i < n_segments; ++i) {
50
+ // const char * text = whisper_full_get_segment_text(ctx, i);
51
+ // printf("%s", text);
52
+ // }
53
+ //
54
+ // whisper_free(ctx);
55
+ //
56
+ // ...
57
+ //
58
+ // This is a demonstration of the most straightforward usage of the library.
59
+ // "pcmf32" contains the RAW audio data in 32-bit floating point format.
60
+ //
61
+ // The interface also allows for more fine-grained control over the computation, but it requires a deeper
62
+ // understanding of how the model works.
63
+ //
64
 
65
  struct whisper_context;
66
 
67
  typedef int whisper_token;
68
 
69
+ // Allocates all memory needed for the model and loads the model from the given file.
70
+ // Returns NULL on failure.
71
  WHISPER_API struct whisper_context * whisper_init(const char * path_model);
72
+
73
+ // Frees all memory allocated by the model.
74
  WHISPER_API void whisper_free(struct whisper_context * ctx);
75
 
76
+ // Convert RAW PCM audio to log mel spectrogram.
77
+ // The resulting spectrogram is stored inside the provided whisper context.
78
+ // Returns 0 on success
79
  WHISPER_API int whisper_pcm_to_mel(
80
  struct whisper_context * ctx,
81
  const float * samples,
82
  int n_samples,
83
  int n_threads);
84
 
85
+ // This can be used to set a custom log mel spectrogram inside the provided whisper context.
86
+ // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
87
  // n_mel must be 80
88
+ // Returns 0 on success
89
  WHISPER_API int whisper_set_mel(
90
  struct whisper_context * ctx,
91
  const float * data,
92
  int n_len,
93
  int n_mel);
94
 
95
+ // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
96
+ // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
97
+ // offset can be used to specify the offset of the first frame in the spectrogram.
98
+ // Returns 0 on success
99
  WHISPER_API int whisper_encode(
100
  struct whisper_context * ctx,
101
  int offset,
102
  int n_threads);
103
 
104
+ // Run the Whisper decoder to obtain the logits and probabilities for the next token.
105
+ // Make sure to call whisper_encode() first.
106
+ // tokens + n_tokens is the provided context for the decoder.
107
+ // n_past is the number of tokens to use from previous decoder calls.
108
+ // Returns 0 on success
109
  WHISPER_API int whisper_decode(
110
  struct whisper_context * ctx,
111
  const whisper_token * tokens,
 
113
  int n_past,
114
  int n_threads);
115
 
116
+ // Token sampling methods.
117
+ // These are provided for convenience and can be used after each call to whisper_decode().
118
+ // You can also implement your own sampling method using the whisper_get_probs() function.
119
+ // whisper_sample_best() returns the token with the highest probability
120
+ // whisper_sample_timestamp() returns the most probable timestamp token
121
  WHISPER_API whisper_token whisper_sample_best(struct whisper_context * ctx, bool need_timestamp);
122
  WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
123
 
124
+ // Return the id of the specified language, returns -1 if not found
125
  WHISPER_API int whisper_lang_id(const char * lang);
126
 
127
  WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
 
129
  WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
130
  WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
131
 
132
+ // The probabilities for the next token
133
  WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
134
 
135
+ // Token Id -> String. Uses the vocabulary in the provided context
136
  WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
137
 
138
+ // Special tokens
139
  WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
140
  WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
141
  WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
 
143
  WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
144
  WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
145
 
146
+ // Task tokens
147
  WHISPER_API whisper_token whisper_token_translate ();
148
  WHISPER_API whisper_token whisper_token_transcribe();
149
 
150
+ // Performance information
151
  WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
152
 
153
  ////////////////////////////////////////////////////////////////////////////
154
 
155
+ // Available decoding strategies
156
  enum whisper_decode_strategy {
157
+ WHISPER_DECODE_GREEDY, // Always select the most probable token
158
+ WHISPER_DECODE_BEAM_SEARCH, // TODO: not implemented yet!
159
  };
160
 
161
  struct whisper_full_params {
 
188
 
189
  WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_decode_strategy strategy);
190
 
191
+ // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
192
+ // Uses the specified decoding strategy to obtain the text.
193
  WHISPER_API int whisper_full(
194
  struct whisper_context * ctx,
195
  struct whisper_full_params params,
196
  const float * samples,
197
  int n_samples);
198
 
199
+ // Number of generated text segments.
200
+ // A segment can be a few words, a sentence, or even a paragraph.
201
  WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
202
 
203
+ // Get the start and end time of the specified segment.
204
  WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
205
  WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
206
 
207
+ // Get the text of the specified segment.
208
  WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
209
 
210
  #ifdef __cplusplus