# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quick test script for video inference with NVIDIA Nemotron Nano VL model.

Note: This script requires pre-extracted video frames. Use ffmpeg or similar
tools to extract frames from your video first:

    ffmpeg -i video.mp4 -vf fps=1 frames/frame_%04d.jpg
"""

import argparse

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

import video_io


def load_model(model_path: str, device: str = "cuda:0"):
    """Load the VLM model and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor)
    """
    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        torch_dtype=torch.bfloat16,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_video_from_frames(
    model,
    tokenizer,
    processor,
    frames_dir: str,
    video_fps: int = 1,
    prompt_text: str = "Describe what you see.",
    device: str = "cuda:0",
    max_new_tokens: int = 128,
    video_pruning_rate: float = 0.75,
):
    """Test model inference on video frames from a directory.

    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        frames_dir: Directory containing extracted video frames
        video_fps: FPS used when extracting frames
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        video_pruning_rate: Video pruning rate for efficient inference
    """
    print(f"\nProcessing video frames from: {frames_dir}")

    # Load frames from directory
    frames = video_io.load_frames_from_directory(frames_dir)

    # Get data URLs and metadata
    image_urls, metadata = video_io.frames_to_data_urls_with_metadata(frames, video_fps)

    print(f"Loaded {len(frames)} frames")
    print(f"Metadata: {metadata}")

    # Prepare messages
    messages = [
        {"role": "system", "content": "/no_think"},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": ""},
                {"type": "text", "text": f"\n{prompt_text}"},
            ],
        },
    ]

    # Generate prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process with FPS metadata
    if metadata:
        inputs = processor(
            text=[prompt],
            videos=frames,
            videos_kwargs={"video_metadata": metadata},
            return_tensors="pt",
        )
    else:
        inputs = processor(
            text=[prompt],
            videos=frames,
            return_tensors="pt",
        )
    inputs = inputs.to(device)

    # Set video pruning rate for efficient inference
    model.video_pruning_rate = video_pruning_rate

    # Generate output
    generated_ids = model.generate(
        pixel_values_videos=inputs.pixel_values_videos,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # Decode output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def main():
    parser = argparse.ArgumentParser(
        description="Test video inference with VLM model using pre-extracted frames",
        epilog="Example: Extract frames with ffmpeg first: "
        "ffmpeg -i video.mp4 -vf fps=1 frames/frame_%%04d.jpg",
    )
    parser.add_argument(
        "--model_path", type=str, required=True,
        help="Path to the pretrained model"
    )
    parser.add_argument(
        "--device", type=str, default="cuda:0",
        help="Device to run inference on (e.g., cuda:0, cpu)"
    )
    parser.add_argument(
        "--frames_dir", type=str, default="images/demo_frames",
        help="Directory containing extracted video frames"
    )
    parser.add_argument(
        "--video_fps", type=int, default=1,
        help="FPS used when extracting frames (for temporal understanding)"
    )
    parser.add_argument(
        "--prompt", type=str, default="Describe what you see.",
        help="Text prompt for the model"
    )
    parser.add_argument(
        "--max_new_tokens", type=int, default=128,
        help="Maximum number of tokens to generate"
    )
    parser.add_argument(
        "--video_pruning_rate", type=float, default=0.75,
        help="Video pruning rate for efficient inference (0.0-1.0)"
    )
    args = parser.parse_args()

    # Load model
    model, tokenizer, processor = load_model(args.model_path, args.device)

    # Test video inference from frames
    print("=" * 50)
    print("Testing Video Inference from Frames")
    print("=" * 50)
    test_video_from_frames(
        model,
        tokenizer,
        processor,
        frames_dir=args.frames_dir,
        video_fps=args.video_fps,
        prompt_text=args.prompt,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
        video_pruning_rate=args.video_pruning_rate,
    )


if __name__ == "__main__":
    main()
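
# A minimal usage sketch, kept as comments so the script stays importable.
# The script filename, checkpoint path, and frames directory below are
# placeholders (not defined by this repo); substitute your own paths.
# Only flags defined in main() above are used.
#
#   # 1. Extract frames at 1 FPS with ffmpeg (matches --video_fps 1)
#   ffmpeg -i video.mp4 -vf fps=1 frames/frame_%04d.jpg
#
#   # 2. Run inference on the extracted frames
#   python test_video_inference.py \
#       --model_path /path/to/nemotron-nano-vl-checkpoint \
#       --frames_dir frames \
#       --video_fps 1 \
#       --prompt "Describe what you see." \
#       --video_pruning_rate 0.75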