ggerganov committed on
Commit 3aa9e6c · unverified · 1 Parent(s): 2439857

whisper : reduce memory usage during inference (#431)


* ggml : add "scratch" buffer support

* ggml : support for scratch ring-buffer

* ggml : bug fix in ggml_repeat()

* ggml : error on scratch buffer overflow

* whisper : use scratch buffers during inference (base model only)

* whisper : update memory usage for all models

* whisper : fix encoder memory usage

* whisper : use whisper_context functions instead of macros

* whisper : fix FF + remove it from README

* ggml : reuse ggml_new_i32

* ggml : refactor the scratch buffer storage

* whisper : reorder scratch buffers in the decoder

* main : add option to disable temp fallback

* Update README.md

Files changed (7)
  1. README.md +109 -98
  2. bindings/javascript/whisper.js +0 -0
  3. examples/main/README.md +31 -21
  4. examples/main/main.cpp +19 -14
  5. ggml.c +88 -35
  6. ggml.h +9 -0
  7. whisper.cpp +392 -250
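
The core mechanism behind the memory reduction is the new `ggml_scratch` API added in `ggml.h` / `ggml.c` below: tensor data can be placed in a caller-provided scratch buffer that gets reused, instead of accumulating in the context's memory pool. A minimal sketch of how the API can be used — the buffer sizes and tensor shapes are illustrative assumptions, not values from this commit:

```cpp
#include "ggml.h"

#include <cstdint>
#include <vector>

int main() {
    // main pool: holds tensor headers and anything that must persist (sizes illustrative)
    std::vector<uint8_t> ctx_buf(16u*1024*1024);
    // scratch region: reusable storage for intermediate tensor data
    std::vector<uint8_t> scr_buf(64u*1024*1024);

    struct ggml_init_params params = {};
    params.mem_size   = ctx_buf.size();
    params.mem_buffer = ctx_buf.data();

    struct ggml_context * ctx = ggml_init(params);

    // while a scratch buffer is set, new tensors put their data into it and
    // only the small ggml_tensor header is taken from the context pool
    struct ggml_scratch scr = {};
    scr.offs = 0;
    scr.size = scr_buf.size();
    scr.data = scr_buf.data();
    ggml_set_scratch(ctx, scr);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b); // data for a, b, c lives in scr_buf

    // switch back to the main pool for tensors that must outlive the scratch reuse
    struct ggml_scratch none = {};
    ggml_set_scratch(ctx, none);

    (void) c;
    ggml_free(ctx);
    return 0;
}
```
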
README.md CHANGED
@@ -13,7 +13,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
13
  - AVX intrinsics support for x86 architectures
14
  - VSX intrinsics support for POWER architectures
15
  - Mixed F16 / F32 precision
16
- - Low memory usage (Flash Attention + Flash Forward)
17
  - Zero memory allocations at runtime
18
  - Runs on the CPU
19
  - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
@@ -89,35 +89,37 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
89
  usage: ./main [options] file0.wav file1.wav ...
90
 
91
  options:
92
- -h, --help [default] show this help message and exit
93
- -t N, --threads N [4 ] number of threads to use during computation
94
- -p N, --processors N [1 ] number of processors to use during computation
95
- -ot N, --offset-t N [0 ] time offset in milliseconds
96
- -on N, --offset-n N [0 ] segment index offset
97
- -d N, --duration N [0 ] duration of audio to process in milliseconds
98
- -mc N, --max-context N [-1 ] maximum number of text context tokens to store
99
- -ml N, --max-len N [0 ] maximum segment length in characters
100
- -bo N, --best-of N [5 ] number of best candidates to keep
101
- -bs N, --beam-size N [-1 ] beam size for beam search
102
- -wt N, --word-thold N [0.01 ] word timestamp probability threshold
103
- -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
104
- -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
105
- -su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
106
- -tr, --translate [false ] translate from source language to english
107
- -di, --diarize [false ] stereo audio diarization
108
- -otxt, --output-txt [false ] output result in a text file
109
- -ovtt, --output-vtt [false ] output result in a vtt file
110
- -osrt, --output-srt [false ] output result in a srt file
111
- -owts, --output-words [false ] output script for generating karaoke video
112
- -ocsv, --output-csv [false ] output result in a CSV file
113
- -ps, --print-special [false ] print special tokens
114
- -pc, --print-colors [false ] print colors
115
- -pp, --print-progress [false ] print progress
116
- -nt, --no-timestamps [true ] do not print timestamps
117
- -l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
118
- --prompt PROMPT [ ] initial prompt
119
- -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
120
- -f FNAME, --file FNAME [ ] input WAV file path
 
 
121
 
122
 
123
  bash ./models/download-ggml-model.sh base.en
@@ -137,7 +139,8 @@ Running base.en on all samples in ./samples ...
137
  [+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
138
  ----------------------------------------------
139
 
140
- whisper_model_load: loading model from 'models/ggml-base.en.bin'
 
141
  whisper_model_load: n_vocab = 51864
142
  whisper_model_load: n_audio_ctx = 1500
143
  whisper_model_load: n_audio_state = 512
@@ -150,13 +153,14 @@ whisper_model_load: n_text_layer = 6
150
  whisper_model_load: n_mels = 80
151
  whisper_model_load: f16 = 1
152
  whisper_model_load: type = 2
 
 
 
153
  whisper_model_load: adding 1607 extra tokens
154
- whisper_model_load: mem_required = 506.00 MB
155
- whisper_model_load: ggml ctx size = 140.60 MB
156
- whisper_model_load: memory size = 22.83 MB
157
  whisper_model_load: model size = 140.54 MB
158
 
159
- system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
160
 
161
  main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
162
 
@@ -164,12 +168,13 @@ main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 proc
164
  [00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
165
 
166
 
167
- whisper_print_timings: load time = 105.91 ms
168
- whisper_print_timings: mel time = 24.62 ms
169
- whisper_print_timings: sample time = 3.63 ms
170
- whisper_print_timings: encode time = 324.71 ms / 54.12 ms per layer
171
- whisper_print_timings: decode time = 83.58 ms / 13.93 ms per layer
172
- whisper_print_timings: total time = 542.81 ms
 
173
  ```
174
 
175
  The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
@@ -212,11 +217,11 @@ make large
212
 
213
  | Model | Disk | Mem | SHA |
214
  | --- | --- | --- | --- |
215
- | tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
216
- | base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
217
- | small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
218
- | medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
219
- | large | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
220
 
221
  ## Limitations
222
 
@@ -234,7 +239,8 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
234
  ```java
235
  $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
236
 
237
- whisper_model_load: loading model from 'models/ggml-medium.en.bin'
 
238
  whisper_model_load: n_vocab = 51864
239
  whisper_model_load: n_audio_ctx = 1500
240
  whisper_model_load: n_audio_state = 1024
@@ -247,55 +253,60 @@ whisper_model_load: n_text_layer = 24
247
  whisper_model_load: n_mels = 80
248
  whisper_model_load: f16 = 1
249
  whisper_model_load: type = 4
250
- whisper_model_load: mem_required = 2610.00 MB
 
 
251
  whisper_model_load: adding 1607 extra tokens
252
- whisper_model_load: ggml ctx size = 1644.97 MB
253
- whisper_model_load: memory size = 182.62 MB
254
- whisper_model_load: model size = 1462.12 MB
255
-
256
- main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, lang = en, task = transcribe, timestamps = 1 ...
257
-
258
- [00:00.000 --> 00:08.000] My fellow Americans, this day has brought terrible news and great sadness to our country.
259
- [00:08.000 --> 00:17.000] At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
260
- [00:17.000 --> 00:23.000] A short time later, debris was seen falling from the skies above Texas.
261
- [00:23.000 --> 00:29.000] The Columbia's lost. There are no survivors.
262
- [00:29.000 --> 00:32.000] On board was a crew of seven.
263
- [00:32.000 --> 00:39.000] Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
264
- [00:39.000 --> 00:48.000] Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
265
- [00:48.000 --> 00:52.000] a colonel in the Israeli Air Force.
266
- [00:52.000 --> 00:58.000] These men and women assumed great risk in the service to all humanity.
267
- [00:58.000 --> 01:03.000] In an age when space flight has come to seem almost routine,
268
- [01:03.000 --> 01:07.000] it is easy to overlook the dangers of travel by rocket
269
- [01:07.000 --> 01:12.000] and the difficulties of navigating the fierce outer atmosphere of the Earth.
270
- [01:12.000 --> 01:18.000] These astronauts knew the dangers, and they faced them willingly,
271
- [01:18.000 --> 01:23.000] knowing they had a high and noble purpose in life.
272
- [01:23.000 --> 01:31.000] Because of their courage and daring and idealism, we will miss them all the more.
273
- [01:31.000 --> 01:36.000] All Americans today are thinking as well of the families of these men and women
274
- [01:36.000 --> 01:40.000] who have been given this sudden shock and grief.
275
- [01:40.000 --> 01:45.000] You're not alone. Our entire nation grieves with you,
276
- [01:45.000 --> 01:52.000] and those you love will always have the respect and gratitude of this country.
277
- [01:52.000 --> 01:56.000] The cause in which they died will continue.
278
- [01:56.000 --> 02:04.000] Mankind is led into the darkness beyond our world by the inspiration of discovery
279
- [02:04.000 --> 02:11.000] and the longing to understand. Our journey into space will go on.
280
- [02:11.000 --> 02:16.000] In the skies today, we saw destruction and tragedy.
281
- [02:16.000 --> 02:22.000] Yet farther than we can see, there is comfort and hope.
282
- [02:22.000 --> 02:29.000] In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
283
- [02:29.000 --> 02:35.000] who created all these. He who brings out the starry hosts one by one
284
- [02:35.000 --> 02:39.000] and calls them each by name."
285
- [02:39.000 --> 02:46.000] Because of His great power and mighty strength, not one of them is missing.
286
- [02:46.000 --> 02:55.000] The same Creator who names the stars also knows the names of the seven souls we mourn today.
287
- [02:55.000 --> 03:01.000] The crew of the shuttle Columbia did not return safely to earth,
288
- [03:01.000 --> 03:05.000] yet we can pray that all are safely home.
289
- [03:05.000 --> 03:13.000] May God bless the grieving families, and may God continue to bless America.
290
- [03:13.000 --> 03:41.000] Audio
291
-
292
-
293
- whisper_print_timings: load time = 575.92 ms
294
- whisper_print_timings: mel time = 230.60 ms
295
- whisper_print_timings: sample time = 73.19 ms
296
- whisper_print_timings: encode time = 19552.61 ms / 814.69 ms per layer
297
- whisper_print_timings: decode time = 13249.96 ms / 552.08 ms per layer
298
- whisper_print_timings: total time = 33686.27 ms
 
 
 
299
  ```
300
  </details>
301
 
@@ -321,14 +332,14 @@ to highlight words with high or low confidence:
321
 
322
  ## Controlling the length of the generated text segments (experimental)
323
 
324
- For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
325
 
326
  ```java
327
  ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
328
 
329
  whisper_model_load: loading model from './models/ggml-base.en.bin'
330
  ...
331
- system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
332
 
333
  main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
334
 
@@ -352,7 +363,7 @@ The `--max-len` argument can be used to obtain word-level timestamps. Simply use
352
 
353
  whisper_model_load: loading model from './models/ggml-base.en.bin'
354
  ...
355
- system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
356
 
357
  main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
358
 
 
13
  - AVX intrinsics support for x86 architectures
14
  - VSX intrinsics support for POWER architectures
15
  - Mixed F16 / F32 precision
16
+ - Low memory usage (Flash Attention)
17
  - Zero memory allocations at runtime
18
  - Runs on the CPU
19
  - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
 
89
  usage: ./main [options] file0.wav file1.wav ...
90
 
91
  options:
92
+ -h, --help [default] show this help message and exit
93
+ -t N, --threads N [4 ] number of threads to use during computation
94
+ -p N, --processors N [1 ] number of processors to use during computation
95
+ -ot N, --offset-t N [0 ] time offset in milliseconds
96
+ -on N, --offset-n N [0 ] segment index offset
97
+ -d N, --duration N [0 ] duration of audio to process in milliseconds
98
+ -mc N, --max-context N [-1 ] maximum number of text context tokens to store
99
+ -ml N, --max-len N [0 ] maximum segment length in characters
100
+ -bo N, --best-of N [5 ] number of best candidates to keep
101
+ -bs N, --beam-size N [-1 ] beam size for beam search
102
+ -wt N, --word-thold N [0.01 ] word timestamp probability threshold
103
+ -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
104
+ -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
105
+ -su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
106
+ -tr, --translate [false ] translate from source language to english
107
+ -di, --diarize [false ] stereo audio diarization
108
+ -nf, --no-fallback [false ] do not use temperature fallback while decoding
109
+ -otxt, --output-txt [false ] output result in a text file
110
+ -ovtt, --output-vtt [false ] output result in a vtt file
111
+ -osrt, --output-srt [false ] output result in a srt file
112
+ -owts, --output-words [false ] output script for generating karaoke video
113
+ -ocsv, --output-csv [false ] output result in a CSV file
114
+ -of FNAME, --output-file FNAME [ ] output file path (without file extension)
115
+ -ps, --print-special [false ] print special tokens
116
+ -pc, --print-colors [false ] print colors
117
+ -pp, --print-progress [false ] print progress
118
+ -nt, --no-timestamps [true ] do not print timestamps
119
+ -l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
120
+ --prompt PROMPT [ ] initial prompt
121
+ -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
122
+ -f FNAME, --file FNAME [ ] input WAV file path
123
 
124
 
125
  bash ./models/download-ggml-model.sh base.en
 
139
  [+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
140
  ----------------------------------------------
141
 
142
+ whisper_init_from_file: loading model from 'models/ggml-base.en.bin'
143
+ whisper_model_load: loading model
144
  whisper_model_load: n_vocab = 51864
145
  whisper_model_load: n_audio_ctx = 1500
146
  whisper_model_load: n_audio_state = 512
 
153
  whisper_model_load: n_mels = 80
154
  whisper_model_load: f16 = 1
155
  whisper_model_load: type = 2
156
+ whisper_model_load: mem required = 215.00 MB (+ 6.00 MB per decoder)
157
+ whisper_model_load: kv self size = 5.25 MB
158
+ whisper_model_load: kv cross size = 17.58 MB
159
  whisper_model_load: adding 1607 extra tokens
160
+ whisper_model_load: model ctx = 140.60 MB
 
 
161
  whisper_model_load: model size = 140.54 MB
162
 
163
+ system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
164
 
165
  main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
166
 
 
168
  [00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
169
 
170
 
171
+ whisper_print_timings: fallbacks = 0 p / 0 h
172
+ whisper_print_timings: load time = 113.81 ms
173
+ whisper_print_timings: mel time = 15.40 ms
174
+ whisper_print_timings: sample time = 11.58 ms / 27 runs ( 0.43 ms per run)
175
+ whisper_print_timings: encode time = 266.60 ms / 1 runs ( 266.60 ms per run)
176
+ whisper_print_timings: decode time = 66.11 ms / 27 runs ( 2.45 ms per run)
177
+ whisper_print_timings: total time = 476.31 ms
178
  ```
179
 
180
  The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
 
217
 
218
  | Model | Disk | Mem | SHA |
219
  | --- | --- | --- | --- |
220
+ | tiny | 75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
221
+ | base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
222
+ | small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
223
+ | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
224
+ | large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
225
 
226
  ## Limitations
227
 
 
239
  ```java
240
  $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
241
 
242
+ whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
243
+ whisper_model_load: loading model
244
  whisper_model_load: n_vocab = 51864
245
  whisper_model_load: n_audio_ctx = 1500
246
  whisper_model_load: n_audio_state = 1024
 
253
  whisper_model_load: n_mels = 80
254
  whisper_model_load: f16 = 1
255
  whisper_model_load: type = 4
256
+ whisper_model_load: mem required = 1720.00 MB (+ 43.00 MB per decoder)
257
+ whisper_model_load: kv self size = 42.00 MB
258
+ whisper_model_load: kv cross size = 140.62 MB
259
  whisper_model_load: adding 1607 extra tokens
260
+ whisper_model_load: model ctx = 1462.35 MB
261
+ whisper_model_load: model size = 1462.12 MB
262
+
263
+ system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
264
+
265
+ main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
266
+
267
+
268
+ [00:00:00.000 --> 00:00:08.000] My fellow Americans, this day has brought terrible news and great sadness to our country.
269
+ [00:00:08.000 --> 00:00:17.000] At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
270
+ [00:00:17.000 --> 00:00:23.000] A short time later, debris was seen falling from the skies above Texas.
271
+ [00:00:23.000 --> 00:00:29.000] The Columbia's lost. There are no survivors.
272
+ [00:00:29.000 --> 00:00:32.000] On board was a crew of seven.
273
+ [00:00:32.000 --> 00:00:39.000] Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
274
+ [00:00:39.000 --> 00:00:48.000] Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
275
+ [00:00:48.000 --> 00:00:52.000] a colonel in the Israeli Air Force.
276
+ [00:00:52.000 --> 00:00:58.000] These men and women assumed great risk in the service to all humanity.
277
+ [00:00:58.000 --> 00:01:03.000] In an age when space flight has come to seem almost routine,
278
+ [00:01:03.000 --> 00:01:07.000] it is easy to overlook the dangers of travel by rocket
279
+ [00:01:07.000 --> 00:01:12.000] and the difficulties of navigating the fierce outer atmosphere of the Earth.
280
+ [00:01:12.000 --> 00:01:18.000] These astronauts knew the dangers, and they faced them willingly,
281
+ [00:01:18.000 --> 00:01:23.000] knowing they had a high and noble purpose in life.
282
+ [00:01:23.000 --> 00:01:31.000] Because of their courage and daring and idealism, we will miss them all the more.
283
+ [00:01:31.000 --> 00:01:36.000] All Americans today are thinking as well of the families of these men and women
284
+ [00:01:36.000 --> 00:01:40.000] who have been given this sudden shock and grief.
285
+ [00:01:40.000 --> 00:01:45.000] You're not alone. Our entire nation grieves with you,
286
+ [00:01:45.000 --> 00:01:52.000] and those you love will always have the respect and gratitude of this country.
287
+ [00:01:52.000 --> 00:01:56.000] The cause in which they died will continue.
288
+ [00:01:56.000 --> 00:02:04.000] Mankind is led into the darkness beyond our world by the inspiration of discovery
289
+ [00:02:04.000 --> 00:02:11.000] and the longing to understand. Our journey into space will go on.
290
+ [00:02:11.000 --> 00:02:16.000] In the skies today, we saw destruction and tragedy.
291
+ [00:02:16.000 --> 00:02:22.000] Yet farther than we can see, there is comfort and hope.
292
+ [00:02:22.000 --> 00:02:29.000] In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
293
+ [00:02:29.000 --> 00:02:35.000] who created all these. He who brings out the starry hosts one by one
294
+ [00:02:35.000 --> 00:02:39.000] and calls them each by name."
295
+ [00:02:39.000 --> 00:02:46.000] Because of His great power and mighty strength, not one of them is missing.
296
+ [00:02:46.000 --> 00:02:55.000] The same Creator who names the stars also knows the names of the seven souls we mourn today.
297
+ [00:02:55.000 --> 00:03:01.000] The crew of the shuttle Columbia did not return safely to earth,
298
+ [00:03:01.000 --> 00:03:05.000] yet we can pray that all are safely home.
299
+ [00:03:05.000 --> 00:03:13.000] May God bless the grieving families, and may God continue to bless America.
300
+ [00:03:13.000 --> 00:03:19.000] [Silence]
301
+
302
+
303
+ whisper_print_timings: fallbacks = 1 p / 0 h
304
+ whisper_print_timings: load time = 569.03 ms
305
+ whisper_print_timings: mel time = 146.85 ms
306
+ whisper_print_timings: sample time = 238.66 ms / 553 runs ( 0.43 ms per run)
307
+ whisper_print_timings: encode time = 18665.10 ms / 9 runs ( 2073.90 ms per run)
308
+ whisper_print_timings: decode time = 13090.93 ms / 549 runs ( 23.85 ms per run)
309
+ whisper_print_timings: total time = 32733.52 ms
310
  ```
311
  </details>
312
 
 
332
 
333
  ## Controlling the length of the generated text segments (experimental)
334
 
335
+ For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
336
 
337
  ```java
338
  ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
339
 
340
  whisper_model_load: loading model from './models/ggml-base.en.bin'
341
  ...
342
+ system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
343
 
344
  main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
345
 
 
363
 
364
  whisper_model_load: loading model from './models/ggml-base.en.bin'
365
  ...
366
+ system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
367
 
368
  main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
369
 
bindings/javascript/whisper.js CHANGED
The diff for this file is too large to render. See raw diff
 
examples/main/README.md CHANGED
@@ -9,25 +9,35 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
9
  usage: ./main [options] file0.wav file1.wav ...
10
 
11
  options:
12
- -h, --help [default] show this help message and exit
13
- -t N, --threads N [4 ] number of threads to use during computation
14
- -p N, --processors N [1 ] number of processors to use during computation
15
- -ot N, --offset-t N [0 ] time offset in milliseconds
16
- -on N, --offset-n N [0 ] segment index offset
17
- -d N, --duration N [0 ] duration of audio to process in milliseconds
18
- -mc N, --max-context N [-1 ] maximum number of text context tokens to store
19
- -ml N, --max-len N [0 ] maximum segment length in characters
20
- -wt N, --word-thold N [0.01 ] word timestamp probability threshold
21
- -su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
22
- -tr, --translate [false ] translate from source language to english
23
- -otxt, --output-txt [false ] output result in a text file
24
- -ovtt, --output-vtt [false ] output result in a vtt file
25
- -osrt, --output-srt [false ] output result in a srt file
26
- -owts, --output-words [false ] output script for generating karaoke video
27
- -ps, --print-special [false ] print special tokens
28
- -pc, --print-colors [false ] print colors
29
- -nt, --no-timestamps [true ] do not print timestamps
30
- -l LANG, --language LANG [en ] spoken language
31
- -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
32
- -f FNAME, --file FNAME [ ] input WAV file path
 
 
 
 
 
 
 
 
 
 
33
  ```
 
9
  usage: ./main [options] file0.wav file1.wav ...
10
 
11
  options:
12
+ -h, --help [default] show this help message and exit
13
+ -t N, --threads N [4 ] number of threads to use during computation
14
+ -p N, --processors N [1 ] number of processors to use during computation
15
+ -ot N, --offset-t N [0 ] time offset in milliseconds
16
+ -on N, --offset-n N [0 ] segment index offset
17
+ -d N, --duration N [0 ] duration of audio to process in milliseconds
18
+ -mc N, --max-context N [-1 ] maximum number of text context tokens to store
19
+ -ml N, --max-len N [0 ] maximum segment length in characters
20
+ -bo N, --best-of N [5 ] number of best candidates to keep
21
+ -bs N, --beam-size N [-1 ] beam size for beam search
22
+ -wt N, --word-thold N [0.01 ] word timestamp probability threshold
23
+ -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
24
+ -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
25
+ -su, --speed-up [false ] speed up audio by x2 (reduced accuracy)
26
+ -tr, --translate [false ] translate from source language to english
27
+ -di, --diarize [false ] stereo audio diarization
28
+ -nf, --no-fallback [false ] do not use temperature fallback while decoding
29
+ -otxt, --output-txt [false ] output result in a text file
30
+ -ovtt, --output-vtt [false ] output result in a vtt file
31
+ -osrt, --output-srt [false ] output result in a srt file
32
+ -owts, --output-words [false ] output script for generating karaoke video
33
+ -ocsv, --output-csv [false ] output result in a CSV file
34
+ -of FNAME, --output-file FNAME [ ] output file path (without file extension)
35
+ -ps, --print-special [false ] print special tokens
36
+ -pc, --print-colors [false ] print colors
37
+ -pp, --print-progress [false ] print progress
38
+ -nt, --no-timestamps [true ] do not print timestamps
39
+ -l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
40
+ --prompt PROMPT [ ] initial prompt
41
+ -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
42
+ -f FNAME, --file FNAME [ ] input WAV file path
43
  ```
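
The new `-nf, --no-fallback` option listed above disables the temperature fallback while decoding. A typical invocation (the model and sample paths are the defaults used elsewhere in this repository):

```
./main -m models/ggml-base.en.bin -f samples/jfk.wav -nf
```
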
examples/main/main.cpp CHANGED
@@ -53,22 +53,23 @@ void replace_all(std::string & s, const std::string & search, const std::string
53
  // command-line parameters
54
  struct whisper_params {
55
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
56
- int32_t n_processors = 1;
57
- int32_t offset_t_ms = 0;
58
- int32_t offset_n = 0;
59
- int32_t duration_ms = 0;
60
  int32_t max_context = -1;
61
- int32_t max_len = 0;
62
- int32_t best_of = 5;
63
  int32_t beam_size = -1;
64
 
65
- float word_thold = 0.01f;
66
- float entropy_thold = 2.4f;
67
- float logprob_thold = -1.0f;
68
 
69
  bool speed_up = false;
70
  bool translate = false;
71
  bool diarize = false;
 
72
  bool output_txt = false;
73
  bool output_vtt = false;
74
  bool output_srt = false;
@@ -117,6 +118,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
117
  else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
118
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
119
  else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
 
120
  else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
121
  else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
122
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
@@ -162,6 +164,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
162
  fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
163
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
164
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
 
165
  fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
166
  fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
167
  fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
@@ -514,7 +517,7 @@ int main(int argc, char ** argv) {
514
 
515
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
516
  const auto fname_inp = params.fname_inp[f];
517
- const auto fname_outp = f < params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
518
 
519
  std::vector<float> pcmf32; // mono-channel F32 PCM
520
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@@ -647,17 +650,19 @@ int main(int argc, char ** argv) {
647
 
648
  wparams.token_timestamps = params.output_wts || params.max_len > 0;
649
  wparams.thold_pt = params.word_thold;
650
- wparams.entropy_thold = params.entropy_thold;
651
- wparams.logprob_thold = params.logprob_thold;
652
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
653
 
654
  wparams.speed_up = params.speed_up;
655
 
 
 
 
656
  wparams.greedy.best_of = params.best_of;
657
  wparams.beam_search.beam_size = params.beam_size;
658
 
659
- wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
660
- wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
 
661
 
662
  whisper_print_user_data user_data = { &params, &pcmf32s };
663
 
 
53
  // command-line parameters
54
  struct whisper_params {
55
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
56
+ int32_t n_processors = 1;
57
+ int32_t offset_t_ms = 0;
58
+ int32_t offset_n = 0;
59
+ int32_t duration_ms = 0;
60
  int32_t max_context = -1;
61
+ int32_t max_len = 0;
62
+ int32_t best_of = 5;
63
  int32_t beam_size = -1;
64
 
65
+ float word_thold = 0.01f;
66
+ float entropy_thold = 2.40f;
67
+ float logprob_thold = -1.00f;
68
 
69
  bool speed_up = false;
70
  bool translate = false;
71
  bool diarize = false;
72
+ bool no_fallback = false;
73
  bool output_txt = false;
74
  bool output_vtt = false;
75
  bool output_srt = false;
 
118
  else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
119
  else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
120
  else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
121
+ else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
122
  else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
123
  else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
124
  else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
 
164
  fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
165
  fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
166
  fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
167
+ fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
168
  fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
169
  fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
170
  fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
 
517
 
518
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
519
  const auto fname_inp = params.fname_inp[f];
520
+ const auto fname_outp = f < (int) params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
521
 
522
  std::vector<float> pcmf32; // mono-channel F32 PCM
523
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
 
650
 
651
  wparams.token_timestamps = params.output_wts || params.max_len > 0;
652
  wparams.thold_pt = params.word_thold;
 
 
653
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
654
 
655
  wparams.speed_up = params.speed_up;
656
 
657
+ wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
658
+ wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
659
+
660
  wparams.greedy.best_of = params.best_of;
661
  wparams.beam_search.beam_size = params.beam_size;
662
 
663
+ wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
664
+ wparams.entropy_thold = params.entropy_thold;
665
+ wparams.logprob_thold = params.logprob_thold;
666
 
667
  whisper_print_user_data user_data = { &params, &pcmf32s };
668
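
For programs calling the library directly, the same behaviour comes from the `whisper_full_params` fields the diff above wires up. A hedged sketch — the wrapper function is illustrative, while the field names and default values are taken from the diff:

```cpp
#include "whisper.h"

// Disable the temperature fallback (equivalent to the new `-nf` flag) and set
// the decoder-fail thresholds that this commit exposes through main.cpp.
static int transcribe_no_fallback(struct whisper_context * ctx,
                                  const float * pcmf32, int n_samples) {
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.temperature_inc = 0.0f;   // no fallback: keep the initial temperature
    wparams.entropy_thold   = 2.40f;  // entropy threshold for decoder fail (default)
    wparams.logprob_thold   = -1.00f; // log probability threshold for decoder fail (default)

    return whisper_full(ctx, wparams, pcmf32, n_samples);
}
```
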
 
ggml.c CHANGED
@@ -1258,7 +1258,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1258
  //
1259
 
1260
  struct ggml_object {
1261
- size_t offset;
1262
  size_t size;
1263
 
1264
  struct ggml_object * next;
@@ -1284,6 +1284,9 @@ struct ggml_context {
1284
 
1285
  struct ggml_object * objects_begin;
1286
  struct ggml_object * objects_end;
 
 
 
1287
  };
1288
 
1289
  struct ggml_context_container {
@@ -1346,7 +1349,7 @@ inline static void ggml_critical_section_end(void) {
1346
 
1347
  void ggml_print_object(const struct ggml_object * obj) {
1348
  GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
1349
- obj->offset, obj->size, (const void *) obj->next);
1350
  }
1351
 
1352
  void ggml_print_objects(const struct ggml_context * ctx) {
@@ -1542,12 +1545,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
1542
  }
1543
 
1544
  *ctx = (struct ggml_context) {
1545
- .mem_size = params.mem_size,
1546
- .mem_buffer = params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
1547
- .mem_buffer_owned = params.mem_buffer ? false : true,
1548
- .n_objects = 0,
1549
- .objects_begin = NULL,
1550
- .objects_end = NULL,
 
 
1551
  };
1552
 
1553
  ggml_assert_aligned(ctx->mem_buffer);
@@ -1570,7 +1575,7 @@ void ggml_free(struct ggml_context * ctx) {
1570
  g_state.contexts[i].used = false;
1571
 
1572
  GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
1573
- __func__, i, ctx->n_objects, ctx->objects_end->offset + ctx->objects_end->size);
1574
 
1575
  if (ctx->mem_buffer_owned) {
1576
  free(ctx->mem_buffer);
@@ -1589,7 +1594,15 @@ void ggml_free(struct ggml_context * ctx) {
1589
  }
1590
 
1591
  size_t ggml_used_mem(const struct ggml_context * ctx) {
1592
- return ctx->objects_end->offset + ctx->objects_end->size;
 
 
 
 
 
 
 
 
1593
  }
1594
 
1595
  ////////////////////////////////////////////////////////////////////////////////
@@ -1603,9 +1616,9 @@ struct ggml_tensor * ggml_new_tensor_impl(
1603
  // always insert objects at the end of the context's memory pool
1604
  struct ggml_object * obj_cur = ctx->objects_end;
1605
 
1606
- const size_t cur_offset = obj_cur == NULL ? 0 : obj_cur->offset;
1607
- const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
1608
- const size_t cur_end = cur_offset + cur_size;
1609
 
1610
  size_t size_needed = 0;
1611
 
@@ -1616,25 +1629,52 @@ struct ggml_tensor * ggml_new_tensor_impl(
1616
  }
1617
  // align to GGML_MEM_ALIGN
1618
  size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
1619
-
1620
- }
1621
- size_needed += sizeof(struct ggml_tensor);
1622
-
1623
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
1624
- GGML_PRINT("%s: not enough space in the context's memory pool\n", __func__);
1625
- assert(false);
1626
- return NULL;
1627
  }
1628
 
1629
  char * const mem_buffer = ctx->mem_buffer;
1630
-
1631
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
1632
 
1633
- *obj_new = (struct ggml_object) {
1634
- .offset = cur_end + GGML_OBJECT_SIZE,
1635
- .size = size_needed,
1636
- .next = NULL,
1637
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1638
 
1639
  if (obj_cur != NULL) {
1640
  obj_cur->next = obj_new;
@@ -1645,9 +1685,9 @@ struct ggml_tensor * ggml_new_tensor_impl(
1645
 
1646
  ctx->objects_end = obj_new;
1647
 
1648
- //GGML_PRINT_DEBUG("%s: inserted new object at %zu\n", __func__, cur_end);
1649
 
1650
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offset);
1651
 
1652
  ggml_assert_aligned(result);
1653
 
@@ -1690,7 +1730,7 @@ struct ggml_tensor * ggml_new_tensor(
1690
  struct ggml_context * ctx,
1691
  enum ggml_type type,
1692
  int n_dims,
1693
- const int* ne) {
1694
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
1695
  }
1696
 
@@ -1732,16 +1772,26 @@ struct ggml_tensor * ggml_new_tensor_4d(
1732
  }
1733
 
1734
  struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
 
 
 
1735
  struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
1736
 
 
 
1737
  ggml_set_i32(result, value);
1738
 
1739
  return result;
1740
  }
1741
 
1742
  struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
 
 
 
1743
  struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
1744
 
 
 
1745
  ggml_set_f32(result, value);
1746
 
1747
  return result;
@@ -2350,7 +2400,7 @@ struct ggml_tensor * ggml_repeat(
2350
  result->op = GGML_OP_REPEAT;
2351
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
2352
  result->src0 = a;
2353
- result->src1 = NULL;
2354
 
2355
  return result;
2356
  }
@@ -2966,9 +3016,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
2966
  // TODO: when implement backward, fix this:
2967
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
2968
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
2969
-
2970
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
2971
- ((int32_t *) b->data)[0] = n_past;
2972
 
2973
  result->op = GGML_OP_DIAG_MASK_INF;
2974
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -4300,7 +4348,9 @@ static bool ggml_compute_forward_mul_mat_use_blas(
4300
  const int ne1 = dst->ne[1];
4301
 
4302
  // TODO: find the optimal values for these
4303
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
 
 
4304
  //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
4305
  return true;
4306
  }
@@ -7289,6 +7339,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
7289
  node->n_tasks = 1; // TODO: this actually is doing nothing
7290
  // the threads are still spinning
7291
  cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
 
 
 
7292
  } else {
7293
  cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
7294
  }
 
1258
  //
1259
 
1260
  struct ggml_object {
1261
+ size_t offs;
1262
  size_t size;
1263
 
1264
  struct ggml_object * next;
 
1284
 
1285
  struct ggml_object * objects_begin;
1286
  struct ggml_object * objects_end;
1287
+
1288
+ struct ggml_scratch scratch;
1289
+ struct ggml_scratch scratch_save;
1290
  };
1291
 
1292
  struct ggml_context_container {
 
1349
 
1350
  void ggml_print_object(const struct ggml_object * obj) {
1351
  GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
1352
+ obj->offs, obj->size, (const void *) obj->next);
1353
  }
1354
 
1355
  void ggml_print_objects(const struct ggml_context * ctx) {
 
1545
  }
1546
 
1547
  *ctx = (struct ggml_context) {
1548
+ /*.mem_size =*/ params.mem_size,
1549
+ /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
1550
+ /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
1551
+ /*.n_objects =*/ 0,
1552
+ /*.objects_begin =*/ NULL,
1553
+ /*.objects_end =*/ NULL,
1554
+ /*.scratch =*/ { 0, 0, NULL, },
1555
+ /*.scratch_save =*/ { 0, 0, NULL, },
1556
  };
1557
 
1558
  ggml_assert_aligned(ctx->mem_buffer);
 
1575
  g_state.contexts[i].used = false;
1576
 
1577
  GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
1578
+ __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
1579
 
1580
  if (ctx->mem_buffer_owned) {
1581
  free(ctx->mem_buffer);
 
1594
  }
1595
 
1596
  size_t ggml_used_mem(const struct ggml_context * ctx) {
1597
+ return ctx->objects_end->offs + ctx->objects_end->size;
1598
+ }
1599
+
1600
+ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
1601
+ const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
1602
+
1603
+ ctx->scratch = scratch;
1604
+
1605
+ return result;
1606
  }
1607
 
1608
  ////////////////////////////////////////////////////////////////////////////////
 
1616
  // always insert objects at the end of the context's memory pool
1617
  struct ggml_object * obj_cur = ctx->objects_end;
1618
 
1619
+ const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
1620
+ const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
1621
+ const size_t cur_end = cur_offs + cur_size;
1622
 
1623
  size_t size_needed = 0;
1624
 
 
1629
  }
1630
  // align to GGML_MEM_ALIGN
1631
  size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
 
 
 
 
 
 
 
 
1632
  }
1633
 
1634
  char * const mem_buffer = ctx->mem_buffer;
 
1635
  struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
1636
 
1637
+ if (ctx->scratch.data == NULL || data != NULL) {
1638
+ size_needed += sizeof(struct ggml_tensor);
1639
+
1640
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
1641
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
1642
+ __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
1643
+ assert(false);
1644
+ return NULL;
1645
+ }
1646
+
1647
+ *obj_new = (struct ggml_object) {
1648
+ .offs = cur_end + GGML_OBJECT_SIZE,
1649
+ .size = size_needed,
1650
+ .next = NULL,
1651
+ };
1652
+ } else {
1653
+ if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
1654
+ GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
1655
+ assert(false);
1656
+ return NULL;
1657
+ }
1658
+
1659
+ if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
1660
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
1661
+ __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
1662
+ assert(false);
1663
+ return NULL;
1664
+ }
1665
+
1666
+ data = (char * const) ctx->scratch.data + ctx->scratch.offs;
1667
+
1668
+ *obj_new = (struct ggml_object) {
1669
+ .offs = cur_end + GGML_OBJECT_SIZE,
1670
+ .size = sizeof(struct ggml_tensor),
1671
+ .next = NULL,
1672
+ };
1673
+
1674
+ //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
1675
+
1676
+ ctx->scratch.offs += size_needed;
1677
+ }
1678
 
1679
  if (obj_cur != NULL) {
1680
  obj_cur->next = obj_new;
 
1685
 
1686
  ctx->objects_end = obj_new;
1687
 
1688
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
1689
 
1690
+ struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
1691
 
1692
  ggml_assert_aligned(result);
1693
 
 
1730
  struct ggml_context * ctx,
1731
  enum ggml_type type,
1732
  int n_dims,
1733
+ const int * ne) {
1734
  return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
1735
  }
1736
 
 
1772
  }
1773
 
1774
  struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
1775
+ ctx->scratch_save = ctx->scratch;
1776
+ ctx->scratch.data = NULL;
1777
+
1778
  struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
1779
 
1780
+ ctx->scratch = ctx->scratch_save;
1781
+
1782
  ggml_set_i32(result, value);
1783
 
1784
  return result;
1785
  }
1786
 
1787
  struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
1788
+ ctx->scratch_save = ctx->scratch;
1789
+ ctx->scratch.data = NULL;
1790
+
1791
  struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
1792
 
1793
+ ctx->scratch = ctx->scratch_save;
1794
+
1795
  ggml_set_f32(result, value);
1796
 
1797
  return result;
 
2400
  result->op = GGML_OP_REPEAT;
2401
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
2402
  result->src0 = a;
2403
+ result->src1 = b;
2404
 
2405
  return result;
2406
  }
 
3016
  // TODO: when implement backward, fix this:
3017
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3018
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
3019
+ struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
 
 
3020
 
3021
  result->op = GGML_OP_DIAG_MASK_INF;
3022
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
4348
  const int ne1 = dst->ne[1];
4349
 
4350
  // TODO: find the optimal values for these
4351
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && (
4352
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)
4353
+ )) {
4354
  //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
4355
  return true;
4356
  }
 
7339
  node->n_tasks = 1; // TODO: this actually is doing nothing
7340
  // the threads are still spinning
7341
  cur = sizeof(float)*(node->src0->ne[0]*node->src0->ne[1]);
7342
+ //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
7343
+ //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
7344
+ //printf("cur = %zu\n", cur);
7345
  } else {
7346
  cur = sizeof(ggml_fp16_t)*ggml_nelements(node->src1);
7347
  }
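
A note on the `ggml_new_i32` / `ggml_new_f32` changes above: they temporarily save and clear the active scratch buffer so that small scalar constants are allocated from the main pool and are not overwritten when scratch storage is reused. A sketch of the user-visible effect (the wrapper function and shapes are illustrative):

```cpp
#include "ggml.h"

#include <cmath>

// While a scratch buffer is active, intermediate tensor data is placed in it and
// later overwritten. Scalar constants such as an attention scale must survive that
// reuse, which is why ggml_new_f32 / ggml_new_i32 bypass the scratch buffer internally.
static struct ggml_tensor * scaled_scores(struct ggml_context * ctx,
                                          struct ggml_tensor * KQ,
                                          int n_state, int n_head) {
    // allocated from the main pool even if ggml_set_scratch() was called earlier
    struct ggml_tensor * scale = ggml_new_f32(ctx, 1.0f/std::sqrt(float(n_state)/n_head));

    // the result's data may live in the scratch region; the constant must not
    return ggml_scale(ctx, KQ, scale);
}
```
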
ggml.h CHANGED
@@ -301,6 +301,13 @@ struct ggml_cgraph {
301
  int64_t perf_time_us;
302
  };
303
 
 
 
 
 
 
 
 
304
  struct ggml_init_params {
305
  // memory pool
306
  size_t mem_size; // bytes
@@ -327,6 +334,8 @@ void ggml_free(struct ggml_context * ctx);
327
 
328
  size_t ggml_used_mem(const struct ggml_context * ctx);
329
 
 
 
330
  struct ggml_tensor * ggml_new_tensor(
331
  struct ggml_context * ctx,
332
  enum ggml_type type,
 
301
  int64_t perf_time_us;
302
  };
303
 
304
+ // scratch buffer
305
+ struct ggml_scratch {
306
+ size_t offs;
307
+ size_t size;
308
+ void * data;
309
+ };
310
+
311
  struct ggml_init_params {
312
  // memory pool
313
  size_t mem_size; // bytes
 
334
 
335
  size_t ggml_used_mem(const struct ggml_context * ctx);
336
 
337
+ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
338
+
339
  struct ggml_tensor * ggml_new_tensor(
340
  struct ggml_context * ctx,
341
  enum ggml_type type,
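
The commit message also mentions a scratch "ring-buffer". The idea, sketched below under stated assumptions, is to alternate between scratch regions so that the output of the previous operation stays intact while the next operation allocates into another region. The `use_scratch` helper and the buffer sizes are hypothetical; the actual switching logic lives in `whisper.cpp` and is not reproduced here.

```cpp
#include "ggml.h"

#include <cstdint>
#include <vector>

// Two reusable scratch regions; sizes are illustrative only.
static std::vector<uint8_t> g_scratch[2] = {
    std::vector<uint8_t>(32u*1024*1024),
    std::vector<uint8_t>(32u*1024*1024),
};

// Hypothetical helper: route subsequent tensor data into scratch region `i`.
static void use_scratch(struct ggml_context * ctx, int i) {
    struct ggml_scratch scr = {};
    scr.offs = 0;
    scr.size = g_scratch[i].size();
    scr.data = g_scratch[i].data();
    ggml_set_scratch(ctx, scr);
}

// A layer-norm-like fragment: consecutive ops ping-pong between the two regions,
// so data produced in one region has already been consumed by the time that
// region's offset is reset and its storage is reused.
static struct ggml_tensor * norm_sketch(struct ggml_context * ctx,
                                        struct ggml_tensor * inp,
                                        struct ggml_tensor * w,
                                        struct ggml_tensor * b) {
    use_scratch(ctx, 0);
    struct ggml_tensor * cur = ggml_norm(ctx, inp);            // data in region 0

    use_scratch(ctx, 1);
    cur = ggml_mul(ctx, ggml_repeat(ctx, w, cur), cur);        // data in region 1

    use_scratch(ctx, 0);
    cur = ggml_add(ctx, cur, ggml_repeat(ctx, b, cur));        // region 0 reused
    return cur;
}
```
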
whisper.cpp CHANGED
@@ -103,6 +103,9 @@ static void byteswap_tensor(ggml_tensor * tensor) {
103
  //#define WHISPER_USE_FLASH_FF
104
  #define WHISPER_MAX_DECODERS 16
105
 
 
 
 
106
  // available whisper models
107
  enum e_model {
108
  MODEL_UNKNOWN,
@@ -217,6 +220,38 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
217
 
218
  static const size_t MB = 1024*1024;
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  static const std::map<e_model, size_t> MEM_REQ_MODEL = {
221
  { MODEL_TINY, 74ull*MB },
222
  { MODEL_BASE, 142ull*MB },
@@ -242,35 +277,19 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
242
  };
243
 
244
  static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
245
- { MODEL_TINY, 80ull*MB },
246
- { MODEL_BASE, 128ull*MB },
247
- { MODEL_SMALL, 300ull*MB },
248
- { MODEL_MEDIUM, 680ull*MB },
249
- { MODEL_LARGE, 1100ull*MB },
250
- };
251
-
252
- static const std::map<e_model, size_t> MEM_REQ_ENCODE_LAYER = {
253
- { MODEL_TINY, 104ull*MB },
254
- { MODEL_BASE, 138ull*MB },
255
- { MODEL_SMALL, 208ull*MB },
256
- { MODEL_MEDIUM, 280ull*MB },
257
- { MODEL_LARGE, 354ull*MB },
258
  };
259
 
260
  static const std::map<e_model, size_t> MEM_REQ_DECODE = {
261
- { MODEL_TINY, 200ull*MB },
262
- { MODEL_BASE, 202ull*MB },
263
- { MODEL_SMALL, 204ull*MB },
264
- { MODEL_MEDIUM, 206ull*MB },
265
- { MODEL_LARGE, 208ull*MB },
266
- };
267
-
268
- static const std::map<e_model, size_t> MEM_REQ_DECODE_LAYER = {
269
- { MODEL_TINY, 32ull*MB },
270
- { MODEL_BASE, 44ull*MB },
271
- { MODEL_SMALL, 64ull*MB },
272
- { MODEL_MEDIUM, 84ull*MB },
273
- { MODEL_LARGE, 110ull*MB },
274
  };
275
 
276
  struct whisper_mel {
@@ -557,7 +576,10 @@ struct whisper_context {
557
 
558
  // memory buffers used by encode / decode contexts
559
  std::vector<uint8_t> buf_compute;
560
- std::vector<uint8_t> buf_compute_layer;
 
 
 
561
 
562
  // decode output (2-dimensional array: [n_tokens][n_vocab])
563
  std::vector<float> logits;
@@ -578,6 +600,37 @@ struct whisper_context {
578
 
579
  // [EXPERIMENTAL] speed-up techniques
580
  int32_t exp_n_audio_ctx; // 0 - use default
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
  };
582
 
583
  template<typename T>
@@ -744,10 +797,13 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
744
  {
745
  // this is the total memory required to run the inference
746
  const size_t mem_required =
747
- scale*MEM_REQ_MODEL.at (model.type) +
748
- scale*MEM_REQ_KV_CROSS.at (model.type) +
749
- scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)) +
750
- scale*std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type));
 
 
 
751
 
752
  // this is the memory required by one decoder
753
  const size_t mem_required_decoder =
@@ -783,8 +839,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
783
  fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
784
  }
785
 
786
- wctx.buf_compute.resize (scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
787
- wctx.buf_compute_layer.resize(scale*std::max(MEM_REQ_ENCODE_LAYER.at(model.type), MEM_REQ_DECODE_LAYER.at(model.type)));
 
 
 
 
788
  }
789
 
790
  // load mel filters
@@ -1317,6 +1377,8 @@ static bool whisper_encode(
1317
 
1318
  struct ggml_context * ctx0 = ggml_init(params);
1319
 
 
 
1320
  struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
1321
  assert(mel->type == GGML_TYPE_F32);
1322
  {
@@ -1337,6 +1399,8 @@ static bool whisper_encode(
1337
 
1338
  // convolution + gelu
1339
  {
 
 
1340
  cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
1341
  cur = ggml_add(ctx0,
1342
  ggml_repeat(ctx0,
@@ -1346,6 +1410,8 @@ static bool whisper_encode(
1346
 
1347
  cur = ggml_gelu(ctx0, cur);
1348
 
 
 
1349
  cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
1350
  cur = ggml_add(ctx0,
1351
  ggml_repeat(ctx0,
@@ -1356,6 +1422,8 @@ static bool whisper_encode(
1356
  cur = ggml_gelu(ctx0, cur);
1357
  }
1358
 
 
 
1359
  // ===================================================================
1360
  // NOTE: experimenting with partial evaluation of the encoder (ignore)
1361
  //static int iter = -1;
@@ -1376,6 +1444,7 @@ static bool whisper_encode(
1376
  struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
1377
 
1378
  cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
 
1379
  // ===================================================================
1380
 
1381
  // original:
@@ -1386,153 +1455,158 @@ static bool whisper_encode(
1386
  for (int il = 0; il < n_layer; ++il) {
1387
  const auto & layer = model.layers_encoder[il];
1388
 
1389
- // create separate context for each layer to reduce memory usage
1390
-
1391
- struct ggml_init_params paramsL;
1392
- paramsL.mem_size = wctx.buf_compute_layer.size();
1393
- paramsL.mem_buffer = wctx.buf_compute_layer.data();
1394
-
1395
- struct ggml_context * ctxL = ggml_init(paramsL);
1396
-
1397
  // norm
1398
  {
1399
- cur = ggml_norm(ctxL, inpL);
 
 
1400
 
1401
  // cur = ln_0_w*cur + ln_0_b
1402
- cur = ggml_add(ctxL,
1403
- ggml_mul(ctxL,
1404
- ggml_repeat(ctxL, layer.attn_ln_0_w, cur),
1405
  cur),
1406
- ggml_repeat(ctxL, layer.attn_ln_0_b, cur));
1407
  }
1408
 
1409
  // self-attention
1410
  {
1411
- struct ggml_tensor * Qcur = ggml_mul_mat(ctxL,
 
 
1412
  layer.attn_q_w,
1413
  cur);
1414
 
1415
- Qcur = ggml_add(ctxL,
1416
- ggml_repeat(ctxL,
1417
  layer.attn_q_b,
1418
  Qcur),
1419
  Qcur);
1420
 
1421
- //Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1422
 
1423
  // note: no bias for Key
1424
- struct ggml_tensor * Kcur = ggml_mul_mat(ctxL,
1425
  layer.attn_k_w,
1426
  cur);
1427
 
1428
- //Kcur = ggml_scale(ctxL, Kcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1429
 
1430
- struct ggml_tensor * Vcur = ggml_mul_mat(ctxL,
1431
  layer.attn_v_w,
1432
  cur);
1433
 
1434
- Vcur = ggml_add(ctxL,
1435
- ggml_repeat(ctxL,
1436
  layer.attn_v_b,
1437
  Vcur),
1438
  Vcur);
1439
 
1440
  // ------
1441
 
 
 
1442
  #ifdef WHISPER_USE_FLASH_ATTN
1443
  struct ggml_tensor * Q =
1444
- ggml_permute(ctxL,
1445
- ggml_cpy(ctxL,
1446
  Qcur,
1447
- ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
1448
  0, 2, 1, 3);
1449
 
1450
  struct ggml_tensor * K =
1451
- ggml_permute(ctxL,
1452
- ggml_cpy(ctxL,
1453
  Kcur,
1454
- ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
1455
  0, 2, 1, 3);
1456
 
1457
  struct ggml_tensor * V =
1458
- ggml_cpy(ctxL,
1459
- ggml_permute(ctxL,
1460
- ggml_reshape_3d(ctxL,
1461
  Vcur,
1462
  n_state/n_head, n_head, n_ctx),
1463
  1, 2, 0, 3),
1464
- ggml_new_tensor_3d(ctxL, wctx.wtype, n_ctx, n_state/n_head, n_head)
1465
  );
1466
 
1467
- struct ggml_tensor * KQV = ggml_flash_attn(ctxL, Q, K, V, false);
1468
  #else
1469
  struct ggml_tensor * Q =
1470
- ggml_permute(ctxL,
1471
- ggml_cpy(ctxL,
1472
  Qcur,
1473
- ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
1474
  0, 2, 1, 3);
1475
 
1476
  struct ggml_tensor * K =
1477
- ggml_permute(ctxL,
1478
- ggml_cpy(ctxL,
1479
  Kcur,
1480
- ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
1481
  0, 2, 1, 3);
1482
 
1483
  // K * Q
1484
- struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q);
1485
 
1486
  struct ggml_tensor * KQ_scaled =
1487
- ggml_scale(ctxL,
1488
  KQ,
1489
- ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head))
1490
  );
1491
 
1492
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ_scaled);
1493
 
1494
  //struct ggml_tensor * V_trans =
1495
- // ggml_permute(ctxL,
1496
- // ggml_cpy(ctxL,
1497
  // Vcur,
1498
- // ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_head, n_ctx)),
1499
  // 1, 2, 0, 3);
1500
 
1501
- //struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
1502
 
1503
  struct ggml_tensor * V =
1504
- ggml_cpy(ctxL,
1505
- ggml_permute(ctxL,
1506
- ggml_reshape_3d(ctxL,
1507
  Vcur,
1508
  n_state/n_head, n_head, n_ctx),
1509
  0, 2, 1, 3),
1510
- ggml_new_tensor_3d(ctxL, wctx.wtype, n_state/n_head, n_ctx, n_head)
1511
  );
1512
 
1513
- struct ggml_tensor * KQV = ggml_mul_mat(ctxL, ggml_transpose(ctxL, V), KQ_soft_max);
1514
  #endif
 
1515
 
1516
- struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3);
1517
 
1518
- cur = ggml_cpy(ctxL,
1519
  KQV_merged,
1520
- ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, n_ctx));
1521
  }
1522
 
1523
  // projection
1524
  {
1525
- cur = ggml_mul_mat(ctxL,
 
 
1526
  layer.attn_ln_1_w,
1527
  cur);
1528
 
1529
- cur = ggml_add(ctxL,
1530
- ggml_repeat(ctxL, layer.attn_ln_1_b, cur),
 
 
1531
  cur);
1532
  }
1533
 
 
 
1534
  // add the input
1535
- cur = ggml_add(ctxL, cur, inpL);
1536
 
1537
  struct ggml_tensor * inpFF = cur;
1538
 
@@ -1540,75 +1614,75 @@ static bool whisper_encode(
1540
  {
1541
  // norm
1542
  {
1543
- cur = ggml_norm(ctxL, inpFF);
 
 
 
 
1544
 
1545
  // cur = mlp_ln_w*cur + mlp_ln_b
1546
- cur = ggml_add(ctxL,
1547
- ggml_mul(ctxL,
1548
- ggml_repeat(ctxL, layer.mlp_ln_w, cur),
1549
  cur),
1550
- ggml_repeat(ctxL, layer.mlp_ln_b, cur));
1551
  }
1552
 
1553
  #ifdef WHISPER_USE_FLASH_FF
1554
- cur = ggml_flash_ff(ctxL,
1555
- ggml_cpy(ctxL, cur, ggml_new_tensor_2d(ctxL, wctx.wtype, n_state, N)),
 
 
1556
  layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
1557
  #else
 
 
1558
  // fully connected
1559
- cur = ggml_mul_mat(ctxL,
1560
  layer.mlp_0_w,
1561
  cur);
1562
 
1563
- cur = ggml_add(ctxL,
1564
- ggml_repeat(ctxL, layer.mlp_0_b, cur),
 
 
1565
  cur);
1566
 
 
 
1567
  // GELU activation
1568
- cur = ggml_gelu(ctxL, cur);
 
 
1569
 
1570
  // projection
1571
- cur = ggml_mul_mat(ctxL,
1572
  layer.mlp_1_w,
1573
  cur);
1574
 
1575
- cur = ggml_add(ctxL,
1576
- ggml_repeat(ctxL, layer.mlp_1_b, cur),
 
 
1577
  cur);
1578
  #endif
1579
  }
1580
 
1581
- // output from this layer
1582
- struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF);
1583
-
1584
- {
1585
- struct ggml_cgraph gf = {};
1586
- gf.n_threads = n_threads;
1587
-
1588
- ggml_build_forward_expand(&gf, inpO);
1589
- ggml_graph_compute (ctxL, &gf);
1590
-
1591
- //ggml_graph_print(&gf);
1592
- }
1593
-
1594
- // TODO: this is a hack to have per-layer computation graphs - need to come up with something better
1595
- // input for next layer (inpO -> inpL)
1596
- memcpy(inpL->data, inpO->data, ggml_nbytes(inpL));
1597
- inpL->op = GGML_OP_NONE;
1598
- inpL->src0 = nullptr;
1599
- inpL->src1 = nullptr;
1600
 
1601
- //printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0);
1602
-
1603
- ggml_free(ctxL);
1604
  }
1605
 
1606
  cur = inpL;
1607
 
1608
  // norm
1609
  {
 
 
1610
  cur = ggml_norm(ctx0, cur);
1611
 
 
 
1612
  // cur = ln_f_g*cur + ln_f_b
1613
  cur = ggml_add(ctx0,
1614
  ggml_mul(ctx0,
@@ -1617,6 +1691,8 @@ static bool whisper_encode(
              ggml_repeat(ctx0, model.e_ln_b, cur));
      }

      // run the computation
      {
          struct ggml_cgraph gf = {};
@@ -1655,12 +1731,16 @@ static bool whisper_encode(
      for (int il = 0; il < model.hparams.n_text_layer; ++il) {
          auto & layer = model.layers_decoder[il];

          struct ggml_tensor * Kcross = ggml_mul_mat(ctx0,
                  layer.cross_attn_k_w,
                  cur);

          Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

          struct ggml_tensor * Vcross = ggml_mul_mat(ctx0,
                  layer.cross_attn_v_w,
                  cur);
@@ -1671,6 +1751,8 @@ static bool whisper_encode(
                      Vcross),
                  Vcross);

          //struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_cross.k, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
          //struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_cross.v, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
          struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_cross.k, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.k)*n_state)*(il*n_ctx));
@@ -1686,7 +1768,12 @@ static bool whisper_encode(

      ////////////////////////////////////////////////////////////////////////////

-     //printf("%s: used_mem = %f MB\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0);

      ggml_free(ctx0);

@@ -1698,7 +1785,7 @@ static bool whisper_encode(

  // evaluate the decoder
  //
- // given text prompt + audio features -> predicts the probabilities for the next token
  //
  //   - model: the model
  //   - n_threads: number of threads to use
@@ -1742,6 +1829,9 @@ static bool whisper_decode(

      struct ggml_context * ctx0 = ggml_init(params);

      struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
      memcpy(embd->data, tokens, N*ggml_element_size(embd));

@@ -1750,6 +1840,8 @@ static bool whisper_decode(
          ((int32_t *) position->data)[i] = n_past + i;
      }

      // token encoding + position encoding
      struct ggml_tensor * cur =
          ggml_add(ctx0,
@@ -1761,211 +1853,248 @@ static bool whisper_decode(
1761
  for (int il = 0; il < n_layer; ++il) {
1762
  const auto & layer = model.layers_decoder[il];
1763
 
1764
- struct ggml_init_params paramsL;
1765
- paramsL.mem_size = wctx.buf_compute_layer.size();
1766
- paramsL.mem_buffer = wctx.buf_compute_layer.data();
1767
-
1768
- struct ggml_context * ctxL = ggml_init(paramsL);
1769
- struct ggml_cgraph gf = {};
1770
- gf.n_threads = n_threads;
1771
-
1772
  // norm
1773
  {
1774
- cur = ggml_norm(ctxL, inpL);
 
 
1775
 
1776
  // cur = ln_0_w*cur + ln_0_b
1777
- cur = ggml_add(ctxL,
1778
- ggml_mul(ctxL,
1779
- ggml_repeat(ctxL, layer.attn_ln_0_w, cur),
1780
  cur),
1781
- ggml_repeat(ctxL, layer.attn_ln_0_b, cur));
1782
  }
1783
 
1784
  // self-attention
1785
  {
1786
- struct ggml_tensor * Qcur = ggml_mul_mat(ctxL,
 
 
1787
  layer.attn_q_w,
1788
  cur);
1789
 
1790
- Qcur = ggml_add(ctxL,
1791
- ggml_repeat(ctxL,
1792
  layer.attn_q_b,
1793
  Qcur),
1794
  Qcur);
1795
 
1796
- Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1797
 
1798
  // note: no bias for Key
1799
- struct ggml_tensor * Kcur = ggml_mul_mat(ctxL,
1800
  layer.attn_k_w,
1801
  cur);
1802
 
1803
- Kcur = ggml_scale(ctxL, Kcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1804
 
1805
- struct ggml_tensor * Vcur = ggml_mul_mat(ctxL,
1806
  layer.attn_v_w,
1807
  cur);
1808
 
1809
- Vcur = ggml_add(ctxL,
1810
- ggml_repeat(ctxL,
1811
  layer.attn_v_b,
1812
  Vcur),
1813
  Vcur);
1814
 
1815
  // store key and value to memory
1816
  {
1817
- struct ggml_tensor * k = ggml_view_1d(ctxL, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
1818
- struct ggml_tensor * v = ggml_view_1d(ctxL, kv_self.v, N*n_state, (ggml_element_size(kv_self.v)*n_state)*(il*n_ctx + n_past));
1819
 
1820
- ggml_build_forward_expand(&gf, ggml_cpy(ctxL, Kcur, k));
1821
- ggml_build_forward_expand(&gf, ggml_cpy(ctxL, Vcur, v));
1822
  }
1823
 
1824
  // ------
1825
 
 
 
1826
  struct ggml_tensor * Q =
1827
- ggml_permute(ctxL,
1828
- ggml_cpy(ctxL,
1829
  Qcur,
1830
- ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
1831
  0, 2, 1, 3);
1832
 
1833
  struct ggml_tensor * K =
1834
- ggml_permute(ctxL,
1835
- ggml_reshape_3d(ctxL,
1836
- ggml_view_1d(ctxL, kv_self.k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.k)*n_state),
1837
  n_state/n_head, n_head, n_past + N),
1838
  0, 2, 1, 3);
1839
 
 
 
1840
  // K * Q
1841
- struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q);
 
 
1842
 
1843
  //struct ggml_tensor * KQ_scaled =
1844
- // ggml_scale(ctxL,
1845
  // KQ,
1846
- // ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head))
1847
  // );
1848
 
1849
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctxL, KQ, n_past);
1850
 
1851
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ_masked);
 
 
 
 
1852
 
1853
  struct ggml_tensor * V_trans =
1854
- ggml_permute(ctxL,
1855
- ggml_reshape_3d(ctxL,
1856
- ggml_view_1d(ctxL, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
1857
  n_state/n_head, n_head, n_past + N),
1858
  1, 2, 0, 3);
1859
 
1860
- struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
1861
 
1862
- struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3);
1863
 
1864
- cur = ggml_cpy(ctxL,
 
 
1865
  KQV_merged,
1866
- ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
1867
  }
1868
 
 
1869
  {
1870
- cur = ggml_mul_mat(ctxL,
 
 
1871
  layer.attn_ln_1_w,
1872
  cur);
1873
 
1874
- cur = ggml_add(ctxL,
1875
- ggml_repeat(ctxL, layer.attn_ln_1_b, cur),
 
 
1876
  cur);
1877
  }
1878
 
 
 
1879
  // add the input
1880
- struct ggml_tensor * inpCA = ggml_add(ctxL, cur, inpL);
1881
 
1882
  // norm
1883
  {
1884
- cur = ggml_norm(ctxL, inpCA); // note: we use inpCA here
 
 
 
 
1885
 
1886
  // cur = ln_0_w*cur + ln_0_b
1887
- cur = ggml_add(ctxL,
1888
- ggml_mul(ctxL,
1889
- ggml_repeat(ctxL, layer.cross_attn_ln_0_w, cur),
1890
  cur),
1891
- ggml_repeat(ctxL, layer.cross_attn_ln_0_b, cur));
1892
  }
1893
 
1894
  // cross-attention
1895
  {
1896
- struct ggml_tensor * Qcur = ggml_mul_mat(ctxL,
 
 
1897
  layer.cross_attn_q_w,
1898
  cur);
1899
 
1900
- Qcur = ggml_add(ctxL,
1901
- ggml_repeat(ctxL,
1902
  layer.cross_attn_q_b,
1903
  Qcur),
1904
  Qcur);
1905
 
1906
- Qcur = ggml_scale(ctxL, Qcur, ggml_new_f32(ctxL, pow(float(n_state)/n_head, -0.25)));
1907
 
1908
  // Kcross is already scaled
1909
  struct ggml_tensor * Kcross =
1910
- ggml_reshape_3d(ctxL,
1911
- ggml_view_1d(ctxL, wctx.kv_cross.k, M*n_state, il*M*ggml_element_size(wctx.kv_cross.k)*n_state),
1912
  n_state/n_head, n_head, M);
1913
 
1914
  struct ggml_tensor * Vcross =
1915
- ggml_reshape_3d(ctxL,
1916
- ggml_view_1d(ctxL, wctx.kv_cross.v, M*n_state, il*M*ggml_element_size(wctx.kv_cross.v)*n_state),
1917
  n_state/n_head, n_head, M);
1918
 
 
 
1919
  // ------
1920
 
 
 
1921
  struct ggml_tensor * Q =
1922
- ggml_permute(ctxL,
1923
- ggml_cpy(ctxL,
1924
  Qcur,
1925
- ggml_new_tensor_3d(ctxL, GGML_TYPE_F32, n_state/n_head, n_head, N)),
1926
  0, 2, 1, 3);
1927
 
1928
- struct ggml_tensor * K = ggml_permute(ctxL, Kcross, 0, 2, 1, 3);
 
 
1929
 
1930
  // K * Q
1931
- struct ggml_tensor * KQ = ggml_mul_mat(ctxL, K, Q);
1932
 
1933
  //struct ggml_tensor * KQ_scaled =
1934
- // ggml_scale(ctxL,
1935
  // KQ,
1936
- // ggml_new_f32(ctxL, 1.0f/sqrt(float(n_state)/n_head))
1937
  // );
1938
 
1939
  // no masking for cross-attention
1940
- //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctxL, KQ_scaled, n_past);
 
 
1941
 
1942
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctxL, KQ);
1943
 
1944
- struct ggml_tensor * V_trans = ggml_permute(ctxL, Vcross, 1, 2, 0, 3);
1945
 
1946
- struct ggml_tensor * KQV = ggml_mul_mat(ctxL, V_trans, KQ_soft_max);
1947
 
1948
- struct ggml_tensor * KQV_merged = ggml_permute(ctxL, KQV, 0, 2, 1, 3);
 
 
1949
 
1950
  // cur = KQV_merged.contiguous().view(n_state, N)
1951
- cur = ggml_cpy(ctxL,
1952
  KQV_merged,
1953
- ggml_new_tensor_2d(ctxL, GGML_TYPE_F32, n_state, N));
1954
  }
1955
 
1956
  // projection
1957
  {
1958
- cur = ggml_mul_mat(ctxL,
 
 
1959
  layer.cross_attn_ln_1_w,
1960
  cur);
1961
 
1962
- cur = ggml_add(ctxL,
1963
- ggml_repeat(ctxL, layer.cross_attn_ln_1_b, cur),
 
 
1964
  cur);
1965
  }
1966
 
 
 
1967
  // add the input
1968
- cur = ggml_add(ctxL, cur, inpCA);
1969
 
1970
  struct ggml_tensor * inpFF = cur;
1971
 
@@ -1973,68 +2102,67 @@ static bool whisper_decode(
1973
  {
1974
  // norm
1975
  {
1976
- cur = ggml_norm(ctxL, inpFF);
 
 
 
 
1977
 
1978
  // cur = mlp_ln_w*cur + mlp_ln_b
1979
- cur = ggml_add(ctxL,
1980
- ggml_mul(ctxL,
1981
- ggml_repeat(ctxL, layer.mlp_ln_w, cur),
1982
  cur),
1983
- ggml_repeat(ctxL, layer.mlp_ln_b, cur));
1984
  }
1985
 
 
 
1986
  // fully connected
1987
- cur = ggml_mul_mat(ctxL,
1988
  layer.mlp_0_w,
1989
  cur);
1990
 
1991
- cur = ggml_add(ctxL,
1992
- ggml_repeat(ctxL, layer.mlp_0_b, cur),
 
 
1993
  cur);
1994
 
 
 
1995
  // GELU activation
1996
- cur = ggml_gelu(ctxL, cur);
 
 
1997
 
1998
  // projection
1999
- cur = ggml_mul_mat(ctxL,
2000
  layer.mlp_1_w,
2001
  cur);
2002
 
2003
- cur = ggml_add(ctxL,
2004
- ggml_repeat(ctxL, layer.mlp_1_b, cur),
2005
- cur);
2006
- }
2007
 
2008
- // output from this layer
2009
- struct ggml_tensor * inpO = ggml_add(ctxL, cur, inpFF);
2010
-
2011
- {
2012
- ggml_build_forward_expand(&gf, inpO);
2013
- ggml_graph_compute (ctxL, &gf);
2014
-
2015
- //ggml_graph_print(&gf);
2016
  }
2017
 
2018
- // TODO: this is a hack to have per-layer computation graphs - need to come up with something better
2019
- // input for next layer (inpO -> inpL)
2020
- memcpy(inpL->data, inpO->data, ggml_nbytes(inpL));
2021
- inpL->op = GGML_OP_NONE;
2022
- inpL->src0 = nullptr;
2023
- inpL->src1 = nullptr;
2024
 
2025
- if (N > 1) {
2026
- //printf("%s: - used_mem(%d) = %f MB\n", __func__, il, ggml_used_mem(ctxL)/1024.0/1024.0);
2027
- }
2028
-
2029
- ggml_free(ctxL);
2030
  }
2031
 
2032
  cur = inpL;
2033
 
2034
  // norm
2035
  {
 
 
2036
  cur = ggml_norm(ctx0, cur);
2037
 
 
 
2038
  cur = ggml_add(ctx0,
2039
  ggml_mul(ctx0,
2040
  ggml_repeat(ctx0, model.d_ln_w, cur),
@@ -2042,24 +2170,38 @@ static bool whisper_decode(
                  ggml_repeat(ctx0, model.d_ln_b, cur));
      }

      struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);

      // run the computation
      {
-         struct ggml_cgraph gf = {};
-         gf.n_threads = n_threads;
-
          ggml_build_forward_expand(&gf, logits);
          ggml_graph_compute       (ctx0, &gf);
      }

-     logits_out.resize(N*n_vocab);
-     memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);

      if (N > 1) {
-         //const float mem_per_token = ggml_used_mem(ctx0)/1024.0/1024.0/N;
-         //printf("%s: used_mem = %f MB / %f per token\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0, mem_per_token);
-         //printf("%s: max mem = %f MB\n", __func__, mem_per_token*model.hparams.n_text_ctx);
      }

      ggml_free(ctx0);
 
  //#define WHISPER_USE_FLASH_FF
  #define WHISPER_MAX_DECODERS 16

+ #define WHISPER_USE_SCRATCH
+ #define WHISPER_MAX_SCRATCH_BUFFERS 16
+
  // available whisper models
  enum e_model {
      MODEL_UNKNOWN,
 

  static const size_t MB = 1024*1024;

+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
+     { MODEL_TINY,   12ull*MB },
+     { MODEL_BASE,   15ull*MB },
+     { MODEL_SMALL,  23ull*MB },
+     { MODEL_MEDIUM, 31ull*MB },
+     { MODEL_LARGE,  38ull*MB },
+ };
+
+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
+     { MODEL_TINY,   18ull*MB },
+     { MODEL_BASE,   24ull*MB },
+     { MODEL_SMALL,  36ull*MB },
+     { MODEL_MEDIUM, 48ull*MB },
+     { MODEL_LARGE,  60ull*MB },
+ };
+
+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH2 = {
+     { MODEL_TINY,   4ull*MB },
+     { MODEL_BASE,   4ull*MB },
+     { MODEL_SMALL,  6ull*MB },
+     { MODEL_MEDIUM, 7ull*MB },
+     { MODEL_LARGE,  9ull*MB },
+ };
+
+ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
+     { MODEL_TINY,   4ull*MB },
+     { MODEL_BASE,   4ull*MB },
+     { MODEL_SMALL,  6ull*MB },
+     { MODEL_MEDIUM, 7ull*MB },
+     { MODEL_LARGE,  9ull*MB },
+ };
+
  static const std::map<e_model, size_t> MEM_REQ_MODEL = {
      { MODEL_TINY,    74ull*MB },
      { MODEL_BASE,   142ull*MB },
 
  };

  static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
+     { MODEL_TINY,    6ull*MB },
+     { MODEL_BASE,    8ull*MB },
+     { MODEL_SMALL,  13ull*MB },
+     { MODEL_MEDIUM, 22ull*MB },
+     { MODEL_LARGE,  33ull*MB },
  };

  static const std::map<e_model, size_t> MEM_REQ_DECODE = {
+     { MODEL_TINY,    3ull*MB },
+     { MODEL_BASE,    5ull*MB },
+     { MODEL_SMALL,  10ull*MB },
+     { MODEL_MEDIUM, 18ull*MB },
+     { MODEL_LARGE,  27ull*MB },
  };
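As a worked example from the tables above: the base model reserves 15 + 24 + 4 + 4 = 47 MB across the four scratch buffers, while the single shared compute buffer is sized from max(MEM_REQ_ENCODE, MEM_REQ_DECODE) = max(8, 5) = 8 MB (times the model-wide scale factor applied below), replacing the per-layer compute buffer (buf_compute_layer) that the previous code allocated.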

  struct whisper_mel {
 

      // memory buffers used by encode / decode contexts
      std::vector<uint8_t> buf_compute;
+     std::vector<uint8_t> buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS];
+
+     int    buf_last = 0;
+     size_t buf_max_size[WHISPER_MAX_SCRATCH_BUFFERS] = { 0 };

      // decode output (2-dimensional array: [n_tokens][n_vocab])
      std::vector<float> logits;

      // [EXPERIMENTAL] speed-up techniques
      int32_t exp_n_audio_ctx; // 0 - use default
+
+     void use_buf(struct ggml_context * ctx, int i) {
+ #if defined(WHISPER_USE_SCRATCH)
+         size_t last_size = 0;
+
+         if (i == -1) {
+             last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
+         } else {
+             auto & buf = buf_scratch[i];
+             last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
+         }
+
+         if (buf_last >= 0) {
+             buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+         }
+
+         buf_last = i;
+ #else
+         (void) i;
+         (void) ctx;
+ #endif
+     }
+
+     size_t get_buf_max_mem(int i) const {
+ #if defined(WHISPER_USE_SCRATCH)
+         return buf_max_size[i];
+ #else
+         (void) i;
+         return 0;
+ #endif
+     }
  };
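In short, use_buf(ctx, i) points all subsequent ggml tensor allocations at the pre-allocated scratch buffer i, and use_buf(ctx, -1) switches back to the main compute buffer; judging by how last_size feeds buf_max_size, ggml_set_scratch() returns how much of the previously active scratch buffer was actually used, so the high-water mark of each buffer can be tracked. A minimal sketch of the intended pattern, mirroring the encoder/decoder code further down:

      wctx.use_buf(ctx0, 0);                          // this step's intermediate tensor lands in scratch buffer 0
      cur = ggml_norm(ctx0, inpL);

      wctx.use_buf(ctx0, 1);                          // the next result goes to scratch buffer 1, so buffer 0 can be overwritten later
      cur = ggml_mul_mat(ctx0, layer.attn_q_w, cur);

      wctx.use_buf(ctx0, -1);                         // scratch disabled again: following tensors come from the main compute buffer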

  template<typename T>
 
      {
          // this is the total memory required to run the inference
          const size_t mem_required =
+                  MEM_REQ_SCRATCH0.at (model.type) +
+                  MEM_REQ_SCRATCH1.at (model.type) +
+                  MEM_REQ_SCRATCH2.at (model.type) +
+                  MEM_REQ_SCRATCH3.at (model.type) +
+            scale*MEM_REQ_MODEL.at   (model.type) +
+            scale*MEM_REQ_KV_CROSS.at(model.type) +
+            scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));

          // this is the memory required by one decoder
          const size_t mem_required_decoder =

          fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
      }

+     wctx.buf_compute.resize(scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
+
+     wctx.buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(model.type));
+     wctx.buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(model.type));
+     wctx.buf_scratch[2].resize(MEM_REQ_SCRATCH2.at(model.type));
+     wctx.buf_scratch[3].resize(MEM_REQ_SCRATCH3.at(model.type));
  }

  // load mel filters
 

      struct ggml_context * ctx0 = ggml_init(params);

+     wctx.use_buf(ctx0, 0);
+
      struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
      assert(mel->type == GGML_TYPE_F32);
      {
 

      // convolution + gelu
      {
+         wctx.use_buf(ctx0, 1);
+
          cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
          cur = ggml_add(ctx0,
                  ggml_repeat(ctx0,
 

          cur = ggml_gelu(ctx0, cur);

+         wctx.use_buf(ctx0, 0);
+
          cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
          cur = ggml_add(ctx0,
                  ggml_repeat(ctx0,
 
          cur = ggml_gelu(ctx0, cur);
      }

+     wctx.use_buf(ctx0, 3);
+
      // ===================================================================
      // NOTE: experimenting with partial evaluation of the encoder (ignore)
      //static int iter = -1;
 
      struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);

      cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+
      // ===================================================================

      // original:
 
      for (int il = 0; il < n_layer; ++il) {
          const auto & layer = model.layers_encoder[il];

          // norm
          {
+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_norm(ctx0, inpL);

              // cur = ln_0_w*cur + ln_0_b
+             cur = ggml_add(ctx0,
+                     ggml_mul(ctx0,
+                         ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
                          cur),
+                     ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
          }

          // self-attention
          {
+             wctx.use_buf(ctx0, 1);
+
+             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                      layer.attn_q_w,
                      cur);

+             Qcur = ggml_add(ctx0,
+                     ggml_repeat(ctx0,
                          layer.attn_q_b,
                          Qcur),
                      Qcur);

+             //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

              // note: no bias for Key
+             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                      layer.attn_k_w,
                      cur);

+             //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

+             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                      layer.attn_v_w,
                      cur);

+             Vcur = ggml_add(ctx0,
+                     ggml_repeat(ctx0,
                          layer.attn_v_b,
                          Vcur),
                      Vcur);

              // ------

+             wctx.use_buf(ctx0, 0);
+
  #ifdef WHISPER_USE_FLASH_ATTN
              struct ggml_tensor * Q =
+                 ggml_permute(ctx0,
+                         ggml_cpy(ctx0,
                              Qcur,
+                             ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                          0, 2, 1, 3);

              struct ggml_tensor * K =
+                 ggml_permute(ctx0,
+                         ggml_cpy(ctx0,
                              Kcur,
+                             ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                          0, 2, 1, 3);

              struct ggml_tensor * V =
+                 ggml_cpy(ctx0,
+                         ggml_permute(ctx0,
+                             ggml_reshape_3d(ctx0,
                                  Vcur,
                                  n_state/n_head, n_head, n_ctx),
                              1, 2, 0, 3),
+                         ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
                          );

+             struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
  #else
              struct ggml_tensor * Q =
+                 ggml_permute(ctx0,
+                         ggml_cpy(ctx0,
                              Qcur,
+                             ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
                          0, 2, 1, 3);

              struct ggml_tensor * K =
+                 ggml_permute(ctx0,
+                         ggml_cpy(ctx0,
                              Kcur,
+                             ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                          0, 2, 1, 3);

              // K * Q
+             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

              struct ggml_tensor * KQ_scaled =
+                 ggml_scale(ctx0,
                          KQ,
+                         ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
                          );

+             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);

              //struct ggml_tensor * V_trans =
+             //    ggml_permute(ctx0,
+             //            ggml_cpy(ctx0,
              //                Vcur,
+             //                ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
              //            1, 2, 0, 3);

+             //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

              struct ggml_tensor * V =
+                 ggml_cpy(ctx0,
+                         ggml_permute(ctx0,
+                             ggml_reshape_3d(ctx0,
                                  Vcur,
                                  n_state/n_head, n_head, n_ctx),
                              0, 2, 1, 3),
+                         ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
                          );

+             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
  #endif
+             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

+             wctx.use_buf(ctx0, 1);

+             cur = ggml_cpy(ctx0,
                      KQV_merged,
+                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
          }

          // projection
          {
+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_mul_mat(ctx0,
                      layer.attn_ln_1_w,
                      cur);

+             wctx.use_buf(ctx0, 1);
+
+             cur = ggml_add(ctx0,
+                     ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
                      cur);
          }

+         wctx.use_buf(ctx0, 2);
+
          // add the input
+         cur = ggml_add(ctx0, cur, inpL);

          struct ggml_tensor * inpFF = cur;

          {
              // norm
              {
+                 wctx.use_buf(ctx0, 0);
+
+                 cur = ggml_norm(ctx0, inpFF);
+
+                 wctx.use_buf(ctx0, 1);

                  // cur = mlp_ln_w*cur + mlp_ln_b
+                 cur = ggml_add(ctx0,
+                         ggml_mul(ctx0,
+                             ggml_repeat(ctx0, layer.mlp_ln_w, cur),
                              cur),
+                         ggml_repeat(ctx0, layer.mlp_ln_b, cur));
              }

  #ifdef WHISPER_USE_FLASH_FF
+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_flash_ff(ctx0,
+                     ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wctx.wtype, n_state, n_ctx)),
                      layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
  #else
+             wctx.use_buf(ctx0, 0);
+
              // fully connected
+             cur = ggml_mul_mat(ctx0,
                      layer.mlp_0_w,
                      cur);

+             wctx.use_buf(ctx0, 1);
+
+             cur = ggml_add(ctx0,
+                     ggml_repeat(ctx0, layer.mlp_0_b, cur),
                      cur);

+             wctx.use_buf(ctx0, 0);
+
              // GELU activation
+             cur = ggml_gelu(ctx0, cur);
+
+             wctx.use_buf(ctx0, 1);

              // projection
+             cur = ggml_mul_mat(ctx0,
                      layer.mlp_1_w,
                      cur);

+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_add(ctx0,
+                     ggml_repeat(ctx0, layer.mlp_1_b, cur),
                      cur);
  #endif
          }

+         wctx.use_buf(ctx0, 3);

+         inpL = ggml_add(ctx0, cur, inpFF);
      }

      cur = inpL;

      // norm
      {
+         wctx.use_buf(ctx0, 0);
+
          cur = ggml_norm(ctx0, cur);

+         wctx.use_buf(ctx0, 1);
+
          // cur = ln_f_g*cur + ln_f_b
          cur = ggml_add(ctx0,
                  ggml_mul(ctx0,

                  ggml_repeat(ctx0, model.e_ln_b, cur));
      }

+     wctx.use_buf(ctx0, -1);
+
      // run the computation
      {
          struct ggml_cgraph gf = {};
 
      for (int il = 0; il < model.hparams.n_text_layer; ++il) {
          auto & layer = model.layers_decoder[il];

+         wctx.use_buf(ctx0, 0);
+
          struct ggml_tensor * Kcross = ggml_mul_mat(ctx0,
                  layer.cross_attn_k_w,
                  cur);

          Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

+         wctx.use_buf(ctx0, 1);
+
          struct ggml_tensor * Vcross = ggml_mul_mat(ctx0,
                  layer.cross_attn_v_w,
                  cur);

                      Vcross),
                  Vcross);

+         wctx.use_buf(ctx0, -1);
+
          //struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_cross.k, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
          //struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_cross.v, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
          struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_cross.k, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.k)*n_state)*(il*n_ctx));

      ////////////////////////////////////////////////////////////////////////////

+     //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
+     //        ggml_used_mem(ctx0)/1024.0/1024.0,
+     //        wctx.get_buf_max_mem(0)/1024.0/1024.0,
+     //        wctx.get_buf_max_mem(1)/1024.0/1024.0,
+     //        wctx.get_buf_max_mem(2)/1024.0/1024.0,
+     //        wctx.get_buf_max_mem(3)/1024.0/1024.0);

      ggml_free(ctx0);
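Uncommenting the printf above reports ggml_used_mem() for the main compute buffer alongside the high-water mark of each of the four scratch buffers (via get_buf_max_mem()), which is the measurement the MEM_REQ_SCRATCH0..3 tables have to cover for a given model.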
 
 

  // evaluate the decoder
  //
+ // given text prompt + audio features -> computes the logits for the next token
  //
  //   - model: the model
  //   - n_threads: number of threads to use
 

      struct ggml_context * ctx0 = ggml_init(params);

+     struct ggml_cgraph gf = {};
+     gf.n_threads = n_threads;
+
      struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
      memcpy(embd->data, tokens, N*ggml_element_size(embd));

 
          ((int32_t *) position->data)[i] = n_past + i;
      }

+     wctx.use_buf(ctx0, 3);
+
      // token encoding + position encoding
      struct ggml_tensor * cur =
          ggml_add(ctx0,
 
      for (int il = 0; il < n_layer; ++il) {
          const auto & layer = model.layers_decoder[il];

          // norm
          {
+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_norm(ctx0, inpL);

              // cur = ln_0_w*cur + ln_0_b
+             cur = ggml_add(ctx0,
+                     ggml_mul(ctx0,
+                         ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
                          cur),
+                     ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
          }

          // self-attention
          {
+             wctx.use_buf(ctx0, 1);
+
+             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                      layer.attn_q_w,
                      cur);

+             Qcur = ggml_add(ctx0,
+                     ggml_repeat(ctx0,
                          layer.attn_q_b,
                          Qcur),
                      Qcur);

+             Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

              // note: no bias for Key
+             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                      layer.attn_k_w,
                      cur);

+             Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

+             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                      layer.attn_v_w,
                      cur);

+             Vcur = ggml_add(ctx0,
+                     ggml_repeat(ctx0,
                          layer.attn_v_b,
                          Vcur),
                      Vcur);

              // store key and value to memory
              {
+                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
+                 struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_state, (ggml_element_size(kv_self.v)*n_state)*(il*n_ctx + n_past));

+                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
+                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
              }

              // ------

+             wctx.use_buf(ctx0, 0);
+
              struct ggml_tensor * Q =
+                 ggml_permute(ctx0,
+                         ggml_cpy(ctx0,
                              Qcur,
+                             ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
                          0, 2, 1, 3);

              struct ggml_tensor * K =
+                 ggml_permute(ctx0,
+                         ggml_reshape_3d(ctx0,
+                             ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.k)*n_state),
                              n_state/n_head, n_head, n_past + N),
                          0, 2, 1, 3);

+             wctx.use_buf(ctx0, 1);
+
              // K * Q
+             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+             wctx.use_buf(ctx0, 0);

              //struct ggml_tensor * KQ_scaled =
+             //    ggml_scale(ctx0,
              //            KQ,
+             //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
              //            );

+             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);

+             wctx.use_buf(ctx0, 1);
+
+             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+
+             wctx.use_buf(ctx0, 0);

              struct ggml_tensor * V_trans =
+                 ggml_permute(ctx0,
+                         ggml_reshape_3d(ctx0,
+                             ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
                              n_state/n_head, n_head, n_past + N),
                          1, 2, 0, 3);

+             wctx.use_buf(ctx0, 1);

+             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

+             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+             cur = ggml_cpy(ctx0,
                      KQV_merged,
+                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
          }

+         // projection
          {
+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_mul_mat(ctx0,
                      layer.attn_ln_1_w,
                      cur);

+             wctx.use_buf(ctx0, 1);
+
+             cur = ggml_add(ctx0,
+                     ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
                      cur);
          }

+         wctx.use_buf(ctx0, 2);
+
          // add the input
+         struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL);

          // norm
          {
+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
+
+             wctx.use_buf(ctx0, 1);

              // cur = ln_0_w*cur + ln_0_b
+             cur = ggml_add(ctx0,
+                     ggml_mul(ctx0,
+                         ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
                          cur),
+                     ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
          }

          // cross-attention
          {
+             wctx.use_buf(ctx0, 0);
+
+             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                      layer.cross_attn_q_w,
                      cur);

+             Qcur = ggml_add(ctx0,
+                     ggml_repeat(ctx0,
                          layer.cross_attn_q_b,
                          Qcur),
                      Qcur);

+             Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

              // Kcross is already scaled
              struct ggml_tensor * Kcross =
+                 ggml_reshape_3d(ctx0,
+                         ggml_view_1d(ctx0, wctx.kv_cross.k, M*n_state, il*M*ggml_element_size(wctx.kv_cross.k)*n_state),
                          n_state/n_head, n_head, M);

              struct ggml_tensor * Vcross =
+                 ggml_reshape_3d(ctx0,
+                         ggml_view_1d(ctx0, wctx.kv_cross.v, M*n_state, il*M*ggml_element_size(wctx.kv_cross.v)*n_state),
                          n_state/n_head, n_head, M);

+             struct ggml_tensor * V_trans = ggml_permute(ctx0, Vcross, 1, 2, 0, 3);
+
              // ------

+             wctx.use_buf(ctx0, 1);
+
              struct ggml_tensor * Q =
+                 ggml_permute(ctx0,
+                         ggml_cpy(ctx0,
                              Qcur,
+                             ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
                          0, 2, 1, 3);

+             struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
+
+             wctx.use_buf(ctx0, 0);

              // K * Q
+             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

              //struct ggml_tensor * KQ_scaled =
+             //    ggml_scale(ctx0,
              //            KQ,
+             //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
              //            );

              // no masking for cross-attention
+             //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+
+             wctx.use_buf(ctx0, 1);

+             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);

+             wctx.use_buf(ctx0, 0);

+             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);

+             wctx.use_buf(ctx0, 1);
+
+             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

              // cur = KQV_merged.contiguous().view(n_state, N)
+             cur = ggml_cpy(ctx0,
                      KQV_merged,
+                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
          }

          // projection
          {
+             wctx.use_buf(ctx0, 0);
+
+             cur = ggml_mul_mat(ctx0,
                      layer.cross_attn_ln_1_w,
                      cur);

+             wctx.use_buf(ctx0, 1);
+
+             cur = ggml_add(ctx0,
+                     ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
                      cur);
          }

+         wctx.use_buf(ctx0, 2);
+
          // add the input
+         cur = ggml_add(ctx0, cur, inpCA);

          struct ggml_tensor * inpFF = cur;

          {
              // norm
              {
+                 wctx.use_buf(ctx0, 0);
+
+                 cur = ggml_norm(ctx0, inpFF);
+
+                 wctx.use_buf(ctx0, 1);

                  // cur = mlp_ln_w*cur + mlp_ln_b
+                 cur = ggml_add(ctx0,
+                         ggml_mul(ctx0,
+                             ggml_repeat(ctx0, layer.mlp_ln_w, cur),
                              cur),
+                         ggml_repeat(ctx0, layer.mlp_ln_b, cur));
              }

+             wctx.use_buf(ctx0, 0);
+
              // fully connected
+             cur = ggml_mul_mat(ctx0,
                      layer.mlp_0_w,
                      cur);

+             wctx.use_buf(ctx0, 1);
+
+             cur = ggml_add(ctx0,
+                     ggml_repeat(ctx0, layer.mlp_0_b, cur),
                      cur);

+             wctx.use_buf(ctx0, 0);
+
              // GELU activation
+             cur = ggml_gelu(ctx0, cur);
+
+             wctx.use_buf(ctx0, 1);

              // projection
+             cur = ggml_mul_mat(ctx0,
                      layer.mlp_1_w,
                      cur);

+             wctx.use_buf(ctx0, 0);

+             cur = ggml_add(ctx0,
+                     ggml_repeat(ctx0, layer.mlp_1_b, cur),
+                     cur);
          }

+         wctx.use_buf(ctx0, 3);

+         inpL = ggml_add(ctx0, cur, inpFF);
      }

      cur = inpL;

      // norm
      {
+         wctx.use_buf(ctx0, 0);
+
          cur = ggml_norm(ctx0, cur);

+         wctx.use_buf(ctx0, 1);
+
          cur = ggml_add(ctx0,
                  ggml_mul(ctx0,
                      ggml_repeat(ctx0, model.d_ln_w, cur),

                  ggml_repeat(ctx0, model.d_ln_b, cur));
      }

+     wctx.use_buf(ctx0, 0);
+
+     // compute logits only for the last token
+     // comment this line to compute logits for all N tokens
+     // might be useful in the future
+     cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
+
      struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);

+     wctx.use_buf(ctx0, -1);
+
      // run the computation
      {
          ggml_build_forward_expand(&gf, logits);
          ggml_graph_compute       (ctx0, &gf);
      }

+     // extract logits for all N tokens
+     //logits_out.resize(N*n_vocab);
+     //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
+
+     // extract logits only for the last token
+     logits_out.resize(n_vocab);
+     memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_vocab);

      if (N > 1) {
+         //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
+         //        ggml_used_mem(ctx0)/1024.0/1024.0,
+         //        wctx.get_buf_max_mem(0)/1024.0/1024.0,
+         //        wctx.get_buf_max_mem(1)/1024.0/1024.0,
+         //        wctx.get_buf_max_mem(2)/1024.0/1024.0,
+         //        wctx.get_buf_max_mem(3)/1024.0/1024.0);
      }

      ggml_free(ctx0);