nvidia
/

audio-flamingo-3

@@ -73,6 +73,287 @@ Extensive evaluations confirm AF3’s effectiveness, setting new benchmarks on o
 **This model is for non-commercial research purposes only.**
 ## Results:
 <center><img src="static/af3_radial-1.png" width="400"></center>

 **This model is for non-commercial research purposes only.**
+## Usage
+Audio Flamingo 3 is supported in 🤗 Transformers. To run the model, first install Transformers:
+```bash
+pip install --upgrade pip
+pip install --upgrade git+https://github.com/huggingface/transformers
+```
+> **Note:** AF3 processes audio in 30-second windows with a **10-minute** total cap per sample. Longer inputs are truncated.
+### Single-turn: audio + text instruction
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Transcribe the input speech."},
+            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
+        ],
+    }
+]
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+### Multi-turn chat
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "Instruction: How does the tone of female speech change throughout the audio? Choose the correct option among the options below: (A) Sad to happy (B) Happy to sad (C) Neutral to happy (D) Happy to neutral.",
+            },
+            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/000000786159.31.wav"},
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [{"type": "text", "text": "(A) Sad to happy"}],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Why do you think so?"},
+        ],
+    },
+]
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+### Batch multiple conversations
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+conversations = [
+    [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Transcribe the input speech."},
+                {
+                    "type": "audio",
+                    "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
+                },
+            ],
+        }
+    ],
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
+                },
+                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
+            ],
+        }
+    ],
+]
+inputs = processor.apply_chat_template(
+    conversations,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+### Text-only and audio-only prompts
+```python
+# text-only
+conv = [{"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}]
+batch = processor.apply_chat_template(conv, tokenize=True, add_generation_prompt=True, return_dict=True).to(device)
+print(processor.batch_decode(model.generate(**batch)[:, batch["input_ids"].shape[1]:], skip_special_tokens=True)[0])
+# audio-only
+conv = [{"role": "user", "content": [{"type": "audio", "path": "https://.../sample.wav"}]}]
+batch = processor.apply_chat_template(conv, tokenize=True, add_generation_prompt=True, return_dict=True).to(device)
+print(processor.batch_decode(model.generate(**batch)[:, batch["input_ids"].shape[1]:], skip_special_tokens=True)[0])
+```
+AF3 transcription checkpoints prefix their answers with fixed assistant phrasing such as `The spoken content of the audio is "<text>".` Passing `strip_prefix=True` to `processor.batch_decode` removes that canned prefix and the surrounding quotes, so only the transcription remains.
+### Transcribe a local/remote file (shortcut)
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+inputs = processor.apply_transcription_request(audio="https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav").to(model.device)
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True, strip_prefix=True)
+print(decoded_outputs)
+```
+### Training / Fine-tuning
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+model.train()
+conversation = [
+    [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Transcribe the input speech."},
+                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "The transcription of the audio is 'summer follows spring the days grow longer and the nights are warm'."}],
+        }
+    ],
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
+                },
+                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "The transcription of the audio is 'some transcription of the audio'."}],
+        }
+    ]
+]
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    output_labels=True,
+).to(model.device)
+loss = model(**inputs).loss
+loss.backward()
+```
+### Generation options
+You can tune decoding as you would for other text-generation models (`batch` is the processor output, e.g. from `processor.apply_chat_template`):
+```python
+generate_kwargs = {
+    "max_new_tokens": 256,
+    "do_sample": True,
+    "temperature": 0.7,
+    "top_p": 0.9,
+}
+out = model.generate(**batch, **generate_kwargs)
+```
+## Additional Speed & Memory Improvements
+### Flash Attention 2
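+If your GPU supports it and you are **not** using `torch.compile`, install Flash-Attention and enable it at load time (in the snippet below, `torch_dtype` and `device` are placeholders for your chosen dtype and target device):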
+```bash
+pip install flash-attn --no-build-isolation
+```
+```python
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, attn_implementation="flash_attention_2"
+).to(device)
+```
+### Torch compile
+AF3’s forward pass is compatible with `torch.compile` for significant speed-ups:
+```python
+import torch
+torch.set_float32_matmul_precision("high")
+model.generation_config.cache_implementation = "static"
+model.generation_config.max_new_tokens = 256
+model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+```
+> `torch.compile` and Flash Attention 2 cannot be used at the same time.
+### PyTorch SDPA
+If Flash-Attention isn’t available, AF3 will use PyTorch scaled dot-product attention (SDPA) by default on supported PyTorch versions. You can set it explicitly (`torch_dtype` and `device` are placeholders for your chosen dtype and target device):
+```python
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, attn_implementation="sdpa"
+).to(device)
+```
 ## Results:
 <center><img src="static/af3_radial-1.png" width="400"></center>