ggerganov committed
Commit 4312a39 · 1 Parent(s): 363140f

Update README.md and finalize the whisper.wasm example

CMakeLists.txt CHANGED
@@ -124,6 +124,7 @@ else()
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2 /D_CRT_SECURE_NO_WARNINGS=1")
     else()
         if (EMSCRIPTEN)
+            # we require support for WASM SIMD 128-bit
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -msimd128")
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
         else()
Makefile CHANGED
@@ -90,7 +90,7 @@ libwhisper.a: ggml.o whisper.o
 	ar rcs libwhisper.a ggml.o whisper.o
 
 clean:
-	rm -f *.o main libwhisper.a
+	rm -f *.o main stream libwhisper.a
 
 #
 # Examples
README.md CHANGED
@@ -289,7 +289,7 @@ You can download the converted models using the [download-ggml-model.sh](downloa
 
 https://ggml.ggerganov.com
 
-For more details, see the conversion script [convert-pt-to-ggml.py](convert-pt-to-ggml.py) or the README in [models](models).
+For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README in [models](models).
 
 ## Bindings
 
examples/whisper.wasm/README.md CHANGED
@@ -1,3 +1,27 @@
 # whisper.wasm
 
-Live demo: https://whisper.ggerganov.com
+Inference of [OpenAI's Whisper ASR model](https://github.com/openai/whisper) inside the browser
+
+This example uses a WebAssembly (WASM) port of the [whisper.cpp](https://github.com/ggerganov/whisper.cpp)
+implementation of the transformer to run the inference inside a web page. The audio data does not leave your computer -
+it is processed locally on your machine. The performance is not great, but you should be able to achieve 2x or 3x
+real-time for the `tiny` and `base` models on a modern CPU and browser (i.e. transcribe 60 seconds of audio in about
+20-30 seconds).
+
+This WASM port utilizes [WASM SIMD 128-bit intrinsics](https://emcc.zcopy.site/docs/porting/simd/), so you have to make
+sure that [your browser supports them](https://webassembly.org/roadmap/).
+
+The example is capable of running all models up to size `small` inclusive. Beyond that, the memory requirements and
+performance are unsatisfactory. The implementation currently supports only the `Greedy` sampling strategy. Both
+transcription and translation are supported.
+
+Since the model data is quite big (74MB for the `tiny` model), you need to manually load the model into the web page.
+
+The example supports both loading audio from a file and recording audio from the microphone. The maximum length of the
+audio is limited to 120 seconds.
+
+## Live demo
+
+Link: https://whisper.ggerganov.com
+
+![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png)
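
For context, a minimal sketch of building the whisper.wasm example with Emscripten; the build directory name and the stock CMake flow are assumptions, so adjust to your setup:

```bash
# assumes the Emscripten SDK is installed and activated (emcmake/emcc on PATH)
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j

# the generated page must be served over HTTP (not file://) and opened in a
# browser with WASM SIMD and threads support, per the links above
```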
examples/whisper.wasm/index-tmpl.html CHANGED
@@ -162,7 +162,7 @@
     </tr>
 </table>
 
-<br><br>
+<br>
 
 <!-- textarea with height filling the rest of the page -->
 <textarea id="output" rows="20"></textarea>
@@ -254,6 +254,10 @@
     return new type(buffer);
 }
 
+//
+// load model
+//
+
 function loadFile(event, fname) {
     var file = event.target.files[0] || null;
     if (file == null) {
@@ -281,6 +285,10 @@
     reader.readAsArrayBuffer(file);
 }
 
+//
+// audio file
+//
+
 function loadAudio(event) {
     if (!context) {
         context = new AudioContext({sampleRate: 16000});
@@ -327,7 +335,7 @@
 }
 
 //
-// Microphone
+// microphone
 //
 
 var mediaRecorder = null;
extra/convert-all.sh CHANGED
@@ -3,6 +3,6 @@
 models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
 
 for model in "${models[@]}"; do
-    python3 convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
+    python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
     mv -v models/ggml-model.bin models/ggml-$model.bin
 done
convert-pt-to-ggml.py → models/convert-pt-to-ggml.py RENAMED
File without changes
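
For reference, a minimal sketch of invoking the relocated conversion script for a single model, mirroring the call in extra/convert-all.sh; the checkpoint path and the ../whisper clone location are assumptions taken from that script:

```bash
# assumes the original PyTorch checkpoint was downloaded by openai/whisper
# into ~/.cache/whisper/ and that a clone of the whisper repo sits at ../whisper
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/base.en.pt ../whisper models/

# the script writes models/ggml-model.bin; rename it per model, as convert-all.sh does
mv -v models/ggml-model.bin models/ggml-base.en.bin
```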