# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quick test script for video inference with NVIDIA Nemotron Nano VL model.

Note: This script requires pre-extracted video frames. Use ffmpeg or similar
tools to extract frames from your video first:

    ffmpeg -i video.mp4 -vf fps=1 frames/frame_%04d.jpg
"""

import argparse

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

import video_io


def load_model(model_path: str, device: str = "cuda:0"):
    """Load the VLM model and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor)
    """
    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        torch_dtype=torch.bfloat16,
    ).eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_video_from_frames(
    model,
    tokenizer,
    processor,
    frames_dir: str,
    video_fps: int = 1,
    prompt_text: str = "Describe what you see.",
    device: str = "cuda:0",
    max_new_tokens: int = 128,
    video_pruning_rate: float = 0.75,
):
    """Test model inference on video frames from a directory.

    Args:
        model: The VLM model
        tokenizer: The tokenizer
        processor: The processor
        frames_dir: Directory containing extracted video frames
        video_fps: FPS used when extracting frames
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        video_pruning_rate: Video pruning rate for efficient inference
    """
    print(f"\nProcessing video frames from: {frames_dir}")

    # Load frames from directory
    frames = video_io.load_frames_from_directory(frames_dir)

    # Get data URLs and metadata
    image_urls, metadata = video_io.frames_to_data_urls_with_metadata(frames, video_fps)

    print(f"Loaded {len(frames)} frames")
    print(f"Metadata: {metadata}")

    # Prepare messages
    messages = [
        {"role": "system", "content": "/no_think"},
        {
            "role": "user",
            "content": [
                {"type": "video", "video": ""},
                {"type": "text", "text": f"\n{prompt_text}"},
            ],
        },
    ]

    # Generate prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Process with FPS metadata
    if metadata:
        inputs = processor(
            text=[prompt],
            videos=frames,
            videos_kwargs={"video_metadata": metadata},
            return_tensors="pt",
        )
    else:
        inputs = processor(
            text=[prompt],
            videos=frames,
            return_tensors="pt",
        )
    inputs = inputs.to(device)

    # Set video pruning rate for efficient inference
    model.video_pruning_rate = video_pruning_rate

    # Generate output
    generated_ids = model.generate(
        pixel_values_videos=inputs.pixel_values_videos,
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
    )

    # Decode output
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )[0]
    print(f"Output: {output_text}\n")


def main():
    parser = argparse.ArgumentParser(
        description="Test video inference with VLM model using pre-extracted frames",
        epilog="Example: Extract frames with ffmpeg first: "
        "ffmpeg -i video.mp4 -vf fps=1 frames/frame_%%04d.jpg",
    )
    parser.add_argument(
        "--model_path", type=str, required=True,
        help="Path to the pretrained model"
    )
    parser.add_argument(
        "--device", type=str, default="cuda:0",
        help="Device to run inference on (e.g., cuda:0, cpu)"
    )
    parser.add_argument(
        "--frames_dir", type=str, default="images/demo_frames",
        help="Directory containing extracted video frames"
    )
    parser.add_argument(
        "--video_fps", type=int, default=1,
        help="FPS used when extracting frames (for temporal understanding)"
    )
    parser.add_argument(
        "--prompt", type=str, default="Describe what you see.",
        help="Text prompt for the model"
    )
    parser.add_argument(
        "--max_new_tokens", type=int, default=128,
        help="Maximum number of tokens to generate"
    )
    parser.add_argument(
        "--video_pruning_rate", type=float, default=0.75,
        help="Video pruning rate for efficient inference (0.0-1.0)"
    )
    args = parser.parse_args()

    # Load model
    model, tokenizer, processor = load_model(args.model_path, args.device)

    # Test video inference from frames
    print("=" * 50)
    print("Testing Video Inference from Frames")
    print("=" * 50)
    test_video_from_frames(
        model,
        tokenizer,
        processor,
        frames_dir=args.frames_dir,
        video_fps=args.video_fps,
        prompt_text=args.prompt,
        device=args.device,
        max_new_tokens=args.max_new_tokens,
        video_pruning_rate=args.video_pruning_rate,
    )


if __name__ == "__main__":
    main()
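
# A minimal usage sketch, kept as comments so the script stays importable.
# The script filename, checkpoint path, and frames directory below are
# placeholders (not defined by this repo); substitute your own paths.
# Only flags defined in main() above are used.
#
#   # 1. Extract frames at 1 FPS with ffmpeg (matches --video_fps 1)
#   ffmpeg -i video.mp4 -vf fps=1 frames/frame_%04d.jpg
#
#   # 2. Run inference on the extracted frames
#   python test_video_inference.py \
#       --model_path /path/to/nemotron-nano-vl-checkpoint \
#       --frames_dir frames \
#       --video_fps 1 \
#       --prompt "Describe what you see." \
#       --video_pruning_rate 0.75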