File size: 51,415 Bytes

# Copyright 2025 Jina AI. All rights reserved.

from abc import ABCMeta, abstractmethod
from copy import deepcopy
from functools import wraps
from math import prod, sqrt
from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union

import einops
import torch
import torch.backends.cuda
import torch.nn as nn
import torch.nn.functional as f
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import PretrainedConfig
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.integrations import use_kernel_forward_from_hub
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from transformers.processing_utils import Unpack

from .configuration_jvlm import (
    ImagePaddingEmbedType,
    ImagePooling2DType,
    ImageProjectionType,
    JinaAttentionConfig,
    JinaFFNConfig,
    JinaLNormConfig,
    JinaTransformerBlockConfig,
    JinaVLConnectorConfig,
    LayerNormType,
)


class Dropout(nn.Dropout):
    """Dropout with an optional per-token rate and optional mask broadcasting.

    On top of :class:`torch.nn.Dropout` this layer supports two training-time
    variants:

    * ``mask_p > 0``: positions flagged in ``drop_mask`` are dropped with
      probability ``mask_p``, all other positions with probability ``p``.
    * ``broadcast_dims``: the dropout mask is sampled with size one along these
      dimensions and shared (broadcast) across them.

    :param p: Base dropout probability
    :param inplace: Forwarded to :func:`torch.nn.functional.dropout` in the
        plain-dropout fallback
    :param mask_p: Dropout probability for positions where ``drop_mask`` is one;
        ``None`` is treated like ``0.0``
    :param broadcast_dims: Dimensions along which a single dropout decision is
        shared
    """

    def __init__(
        self,
        p: float = 0.5,
        inplace: bool = False,
        mask_p: float = 0.0,
        broadcast_dims: Sequence[int] = (),
    ) -> None:
        super().__init__(p, inplace)
        self.mask_p = mask_p
        self.broadcast_dims = broadcast_dims

    def forward(
        self, _input: torch.Tensor, drop_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        :param _input: A tensor of shape `(batch_size, seq_len, embed_dim)`
        :param drop_mask: A tensor of shape `(batch_size, seq_len)` with values of zero
            or one. Required while training when ``mask_p > 0``
        :return: A tensor with the same shape as ``_input``
        """
        # ``mask_p`` may be None; normalise once so every comparison below is safe
        mask_p = 0.0 if self.mask_p is None else self.mask_p
        if self.p == 0.0 and mask_p == 0.0:
            return _input

        if self.training and mask_p > 0.0:
            assert drop_mask is not None
            drop_mask = drop_mask.to(_input.dtype)
            # Per-position keep probability: (1 - mask_p) where the mask is one,
            # (1 - p) elsewhere
            keep_prob = drop_mask * (1.0 - mask_p) + (1 - drop_mask) * (1.0 - self.p)
            keep_prob = keep_prob.unsqueeze(-1).broadcast_to(_input.shape)
            multiplier = _input.new_empty(_input.shape).bernoulli_(keep_prob)
            # Where keep_prob is zero the sample is zero as well; clamping the
            # divisor keeps those entries at 0 instead of producing 0 / 0 = NaN
            multiplier.div_(keep_prob.clamp_min(torch.finfo(keep_prob.dtype).tiny))
            return _input * multiplier

        if self.training and self.p > 0.0 and len(self.broadcast_dims) > 0:
            keep_prob = 1.0 - self.p
            dropout_shape = list(_input.shape)
            for dim in self.broadcast_dims:
                dropout_shape[dim] = 1
            keep = _input.new_empty(dropout_shape).bernoulli_(keep_prob)
            # Rescale the compact mask *before* it is broadcast: an in-place op on
            # an expanded view is rejected by PyTorch because several elements
            # alias the same memory location
            if keep_prob > 0.0:
                keep.div_(keep_prob)
            # Multiplication broadcasts the size-one dims over ``_input``
            return _input * keep

        return f.dropout(_input, self.p, self.training, self.inplace)


class ResidualPathDropout(nn.Module):
    """Stochastic Depth: randomly zero the whole residual branch of a sample.

    Taken from
    https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py

    :param p: Probability of dropping the path of a sample, must be in ``[0, 1)``
    :param scale_by_keep: Whether surviving samples are divided by ``1 - p``
    """

    def __init__(self, p: float = 0.5, scale_by_keep: bool = True) -> None:
        super().__init__()
        assert 0 <= p < 1.0
        self.p = p
        self.scale_by_keep = scale_by_keep

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply drop path to ``x``.

        One Bernoulli decision is drawn per entry of the first dimension and
        broadcast over all remaining dimensions. Outside of training, or when
        ``p`` is zero, ``x`` is returned untouched.

        Background on the naming ('drop path' rather than 'DropConnect'):
        https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
        """
        if not self.training or self.p == 0.0:
            return x

        survival = 1 - self.p
        # (batch, 1, 1, ...) so the layer works for any tensor rank
        gate_shape = (x.shape[0], *([1] * (x.ndim - 1)))
        gate = x.new_empty(gate_shape).bernoulli_(survival)
        if self.scale_by_keep and survival > 0.0:
            gate.div_(survival)
        return x * gate


class PatchDropout(nn.Module):
    """Randomly keep only a subset of the tokens of every sample while training.

    https://arxiv.org/abs/2212.00794

    :param p: Fraction of tokens to drop, must be in ``[0, 1)``
    :param exclude_first_token: If set, the first token along dimension 1 (e.g. a
        CLS token) is never dropped and is re-attached in front of the kept tokens
    """

    def __init__(self, p: float = 0.5, exclude_first_token: bool = True):
        super().__init__()
        assert 0 <= p < 1.0
        self.p = p
        self.exclude_first_token = exclude_first_token  # exclude CLS token

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        :param x: A tensor whose first two dimensions are ``(batch, tokens)``; any
            trailing dimensions (e.g. the embedding dimension) are carried along
        :return: ``(x, None)`` when not training or ``p == 0``; otherwise the
            reduced tensor and the indices of the kept tokens, of shape
            ``(batch, num_kept)``. The indices refer to ``x`` *after* the first
            token has been split off when ``exclude_first_token`` is set
        """
        if not self.training or self.p == 0.0:
            return x, None

        if self.exclude_first_token:
            _cls_tokens, x = x[:, :1], x[:, 1:]
        else:
            _cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])

        # Only the leading two dims are needed; unpacking the full size would
        # fail for the usual (batch, tokens, dim) input
        batch, ntokens = x.shape[:2]
        batch_indices = torch.arange(batch)
        batch_indices = batch_indices[..., None]

        keep_prob = 1 - self.p
        # Always keep at least one token
        num_patches_keep = max(1, int(ntokens * keep_prob))
        # Top-k over random scores yields a uniformly random subset per sample
        rand = torch.randn(batch, ntokens)
        patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices

        x = x[batch_indices, patch_indices_keep]
        if self.exclude_first_token:
            x = torch.cat((_cls_tokens, x), dim=1)

        return x, patch_indices_keep


"""
Embedding layers. Adapted from AllenAI Molmo 
https://github.com/allenai/molmo
"""


class ExtendedEmbedding(nn.Module):
    """Embedding lookup over a base table followed by an extension table.

    Both tables are separate parameters initialised to zeros; indices
    ``[0, num_embeddings)`` address the base table and the following
    ``num_new_embeddings`` indices address the extension.

    :param num_embeddings: Number of rows of the base table
    :param num_new_embeddings: Number of rows of the extension table
    :param num_features: Embedding dimension shared by both tables
    """

    def __init__(
        self,
        num_embeddings: int,
        num_new_embeddings: int,
        num_features: int,
    ):
        super().__init__()
        base_table = torch.zeros(num_embeddings, num_features)
        extra_table = torch.zeros(num_new_embeddings, num_features)
        self.embedding = nn.Parameter(base_table)
        self.new_embedding = nn.Parameter(extra_table)

    @property
    def weight(self):
        """Alias for the base table only (the extension is not included)."""
        return self.embedding

    @weight.setter
    def weight(self, w):
        # NOTE(review): nn.Module.__setattr__ handles Parameter values before a
        # property setter is consulted - confirm this setter is reachable for the
        # values callers actually assign
        self.embedding = w

    @property
    def embedding_table(self) -> torch.Tensor:
        """Base and extension tables stacked along dimension 0."""
        tables = (self.embedding, self.new_embedding)
        return torch.cat(tables, dim=0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Look up the indices ``x`` in the stacked table."""
        table = self.embedding_table
        return f.embedding(x, table)


class PatchEmbedding(nn.Module):
    """Project an image, or a sequence of flattened patches, to patch embeddings.

    The projection is either a strided :class:`torch.nn.Conv2d` (default) or a
    :class:`torch.nn.Linear` applied to flattened patches. Flattened patches use
    the pixel layout ``(patch_row, patch_col, channel)``, i.e. the layout
    produced by :meth:`_linear_pre_projection`.

    :param dim: Output embedding dimension
    :param patch_size: Side length of the (square) patches
    :param num_channels: Number of image channels
    :param input_size: Optional fixed ``(height, width)`` of the input images.
        Required for the linear projection
    :param bias: Whether the projection has a bias
    :param use_linear: Use a linear projection instead of a convolution
    """

    def __init__(
        self,
        dim: int = 768,
        patch_size: int = 16,
        num_channels: int = 3,
        input_size: Optional[Tuple[int, int]] = None,
        bias: bool = True,
        use_linear: bool = False,
    ):
        super().__init__()
        # Normalise to a tuple so that a list (e.g. from a deserialised config)
        # compares equal to the ``(h, w)`` tuple checked in ``forward``
        self._input_size = tuple(input_size) if input_size is not None else None
        self._patch_size = (patch_size, patch_size)
        if self._input_size is not None:
            self._patch_shape = (
                self._input_size[0] // self._patch_size[0],
                self._input_size[1] // self._patch_size[1],
            )
            self._num_patches = prod(self._patch_shape)
        else:
            assert not use_linear, 'Linear patch embedding requires a fixed input size!'
            self._patch_shape = None
            self._num_patches = None
        self._num_channels = num_channels
        self._dim = dim
        self._bias = bias
        if use_linear:
            self.proj = nn.Linear(
                self._num_channels * self._patch_size[0] * self._patch_size[1],
                self._dim,
                bias=self._bias,
            )
            self._proj_impl = 'linear'
        else:
            self.proj = nn.Conv2d(
                self._num_channels,
                self._dim,
                kernel_size=self._patch_size,
                stride=self._patch_size,
                bias=self._bias,
            )
            self._proj_impl = 'conv2d'

    def _linear_pre_projection(self, x: torch.Tensor) -> torch.Tensor:
        """Cut a ``(batch, channels, height, width)`` image into flattened patches.

        :return: A tensor of shape ``(batch, n_patches, channels * p1 * p2)`` with
            patches in row-major grid order and pixels laid out as
            ``(patch_row, patch_col, channel)``
        """
        b, c, *_ = x.shape
        p1, p2 = self._patch_size
        patches = x.unfold(2, p1, p1).unfold(3, p2, p2)
        patches = patches.permute(0, 2, 3, 4, 5, 1)
        return patches.reshape(b, -1, c * p1 * p2)

    @staticmethod
    def _conv2d_pre_projection(x: torch.Tensor) -> torch.Tensor:
        """Images are fed to the convolution as they are."""
        return x

    def _patches_to_image(self, x: torch.Tensor, grid: Tuple[int, int]) -> torch.Tensor:
        """Inverse of :meth:`_linear_pre_projection`: rebuild the 4D image from
        ``(batch, n_patches, n_pixels)`` patches arranged on a ``grid``."""
        rows, cols = grid
        p1, p2 = self._patch_size
        ch = self._num_channels
        return (
            x.reshape(x.shape[0], rows, cols, p1, p2, ch)
            .permute(0, 5, 1, 3, 2, 4)
            .reshape(x.shape[0], ch, rows * p1, cols * p2)
        )

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Tuple[int, int]]:
        """Embed ``x``.

        :param x: Either an image batch of shape
            ``(batch_size, n_channels, height, width)`` or flattened patches of
            shape ``(batch_size, seq_len, n_pixels)``
        :return: The embeddings of shape ``(batch_size, n_patches, dim)`` and the
            patch grid ``(rows, cols)``
        :raises ValueError: If ``x`` is neither 3D nor 4D
        """
        p1, p2 = self._patch_size

        # shape: (batch_size, n_channels, height, width)
        if len(x.shape) == 4:
            _, ch, h, w = x.shape
            assert ch == self._num_channels, (
                f'Input tensor has {ch} channels, but model expects '
                f'{self._num_channels} channels'
            )
            if self._input_size is not None:
                assert (h, w) == self._input_size, (
                    f"Input image shape {(h, w)} doesn't match model's "
                    f'{self._input_size}'
                )
            else:
                assert h % p1 == 0 and w % p2 == 0, (
                    f'Input image shape {(h, w)} is not divisible by patch size '
                    f'{self._patch_size}'
                )
            if self._proj_impl == 'linear':
                x = self._linear_pre_projection(x)
            shape = (h // p1, w // p2)

        # shape: (batch_size, seq_len, n_pixels)
        elif len(x.shape) == 3:
            _, sl, n_pixels = x.shape
            if self._input_size is not None:
                assert self._num_patches == sl, (
                    f"Input sequence length ({sl}) doesn't match model's patch shape "
                    f'({self._patch_shape})'
                )
                # The grid is known from the fixed input size and may be
                # non-square, so it must not be inferred from the sequence length
                shape = self._patch_shape
            else:
                h = int(sqrt(sl))
                assert h * h == sl, (
                    f'Input sequence length {sl} is not a perfect square. Please '
                    f'provide a square sequence length, from which the shape can be '
                    f'inferred. For non-square inputs, use a 4D tensor with shape '
                    f'(batch_size, n_channels, height, width)'
                )
                shape = (h, h)
            assert n_pixels == self._num_channels * p1 * p2, (
                f'The input number of pixels ({n_pixels}) does not match the expected '
                f'number n_channels * patch_size_horizontal * patch_size_vertical '
                f'({self._num_channels * p1 * p2})'
            )
            if self._proj_impl == 'conv2d':
                # Reshape to 4D tensor for Conv2d projection
                x = self._patches_to_image(x, shape)
        else:
            raise ValueError(
                f'Input tensor must be 3D or 4D, got {len(x.shape)}D tensor with shape '
                f'{x.shape}. Accepted shapes are (batch_size, n_channels, height, '
                f'width) or (batch_size, seq_len, n_pixels)'
            )
        out = self.proj(x.to(dtype=self.proj.weight.dtype))
        if self._proj_impl == 'conv2d':
            # (batch, dim, rows, cols) -> (batch, rows * cols, dim)
            out = out.flatten(2).permute(0, 2, 1)
        return out, shape


"""
Rotary Positional Embeddings. Compatible with HuggingFace transformers 
https://github.com/huggingface/transformers/blob/main/src/transformers/
modeling_rope_utils.py
"""


def inv_freq_to_device(rope_forward):
    """Decorator that re-initialises ``inv_freq`` before a RoPE forward if needed.

    The inverse frequencies may have been computed on another device than the
    input, or may have ended up in a precision lower than float32. In either
    case they are recomputed with ``self.rope_init_fn`` on the device of ``x``
    and re-registered as a non-persistent buffer before ``rope_forward`` runs.
    """

    @wraps(rope_forward)
    def wrapper(self, x, position_ids):
        stale_dtype = self.inv_freq.dtype != torch.float32
        if stale_dtype or self.rope_init_device != x.device:
            target_device = x.device
            fresh_inv_freq, self.attention_scaling = self.rope_init_fn(
                self.config, target_device, self.max_seq_len_cached
            )
            self.register_buffer('inv_freq', fresh_inv_freq, persistent=False)
            # Keep the reference copy and the bookkeeping in sync with the buffer
            self.original_inv_freq = self.inv_freq
            self.rope_init_device = target_device
        return rope_forward(self, x, position_ids)

    return wrapper


class RotaryEmbedding(nn.Module):
    """Rotary position embedding that produces ``cos`` / ``sin`` tables.

    A deep copy of ``config`` is taken and the RoPE related attributes are
    overwritten with the explicit constructor arguments, so that the
    ``ROPE_INIT_FUNCTIONS`` entry can be called with it.

    :param config: Model configuration; must expose ``rope_theta`` and
        ``max_position_embeddings``
    :param theta: Value stored as ``rope_theta`` on the copied config
    :param head_dim: Value stored as ``head_dim`` on the copied config
    :param hidden_size: Value stored as ``hidden_size`` on the copied config
    :param partial_rotary_factor: Value stored as ``partial_rotary_factor`` on the
        copied config
    :param device: Device on which the inverse frequencies are initialised;
        defaults to CUDA when available, else CPU
    :param scaling: Value stored as ``rope_scaling`` on the copied config (an empty
        dict when not given)
    """

    inv_freq: torch.Tensor

    def __init__(
        self,
        config: PretrainedConfig,
        theta: float,
        head_dim: int,
        hidden_size: int,
        partial_rotary_factor: float,
        device: Optional[torch.device] = None,
        scaling: Optional[Dict[str, Any]] = None,
    ):
        super().__init__()
        assert hasattr(config, 'rope_theta')
        self.config = deepcopy(config)

        # NOTE: for HF RoPE interface compatibility
        overrides = {
            'rope_theta': theta,
            'partial_rotary_factor': partial_rotary_factor,
            'head_dim': head_dim,
            'hidden_size': hidden_size,
            'rope_scaling': scaling or {},
        }
        for attr_name, attr_value in overrides.items():
            setattr(self.config, attr_name, attr_value)

        # NOTE(review): the rope type is read from the *original* config, while the
        # init function receives the copied config whose ``rope_scaling`` comes from
        # ``scaling`` - confirm callers always pass matching values
        original_scaling = getattr(config, 'rope_scaling', None)
        if original_scaling is None:
            self.rope_type = 'default'
        else:
            self.rope_type = original_scaling.get('rope_type', 'default')

        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
        if not device:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        max_len = config.max_position_embeddings or config.max_sequence_length
        inv_freq, self.attention_scaling = self.rope_init_fn(
            self.config, device, max_len
        )
        self.rope_init_device = device
        self.register_buffer('inv_freq', inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq
        self.max_seq_len_cached = max_len
        self.original_max_seq_len = self.max_seq_len_cached

    @torch.no_grad()
    @inv_freq_to_device
    @dynamic_rope_update
    def forward(self, x: torch.Tensor, position_ids: torch.Tensor):
        """Compute the ``cos`` and ``sin`` tables for ``position_ids``.

        :param x: Only its device is used here
        :param position_ids: Positions, indexed as ``(batch, seq_len)``
        :return: ``(cos, sin)``, each scaled by ``attention_scaling``; the last
            dimension is twice the length of ``inv_freq``
        """
        autocast_device = x.device.type
        if not isinstance(autocast_device, str) or autocast_device == 'mps':
            autocast_device = 'cpu'

        # Angles are computed with autocast disabled
        with torch.autocast(device_type=autocast_device, enabled=False):
            n_rows = position_ids.shape[0]
            inv = self.inv_freq[None, :, None].expand(n_rows, -1, 1)
            positions = position_ids[:, None, :].float()
            angles = (inv * positions).transpose(1, 2)
            emb = torch.cat((angles, angles), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos, sin


"""
Residual wrapper. Adapted from AllenAI Molmo 
https://github.com/allenai/molmo
"""


class Residual(nn.Module):
    """Wrap ``submodule`` with a skip connection: ``x + submodule(x)``."""

    def __init__(self, submodule: nn.Module):
        super().__init__()
        self.submodule = submodule

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        branch = self.submodule(x)
        return x + branch


"""
Layer scaling. Adapted from 
https://github.com/facebookresearch/dinov2/blob/main/dinov2/layers/layer_scale.py
"""


class LayerScale(nn.Module):
    """
    Learnable per-channel scaling (LayerScale) as it appears in DINO v2
    From
    https://github.com/facebookresearch/dinov2/blob/main/dinov2/layers/layer_scale.py

    :param dim: Size of the scaled (last) dimension
    :param init_value: Initial value of every scale factor
    :param inplace: Multiply the input in place instead of allocating a new tensor
    """

    def __init__(
        self,
        dim: int,
        init_value: float = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.init_value = init_value
        self.inplace = inplace
        initial_scale = torch.ones((dim,)) * init_value
        self.gamma = nn.Parameter(initial_scale, requires_grad=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma


"""
Layer normalization. Adapted from AllenAI Molmo 
https://github.com/allenai/molmo
"""


class _LayerNorm(nn.Module, metaclass=ABCMeta):
    """Abstract base for the layer norm variants in this module.

    Sets up ``eps``, ``normalized_shape`` and the optional affine parameters;
    subclasses implement :meth:`forward`.

    :param config: Layer norm configuration (``eps``, ``with_affine`` and ``bias``
        are read)
    :param size: Size of the normalised (last) dimension
    :param elementwise_affine: Whether to create a ``weight`` (and optionally a
        ``bias``); ``None`` defers to ``config.with_affine``
    :param eps: Fallback epsilon, used when ``config.eps`` is falsy
    :param weight_initializer: Called with ``normalized_shape`` to create the
        initial weight
    :param bias_initializer: Called with ``normalized_shape`` to create the
        initial bias
    """

    def __init__(
        self,
        config: JinaLNormConfig,
        size: int,
        elementwise_affine: Optional[bool] = True,
        eps: float = 1e-05,
        weight_initializer: Optional[Callable] = torch.ones,
        bias_initializer: Optional[Callable] = torch.zeros,
    ):
        super().__init__()
        self.config = config
        self.eps = self.config.eps or eps
        self.normalized_shape = (size,)

        affine = elementwise_affine
        if affine is None:
            affine = self.config.with_affine

        if not affine:
            self.register_parameter('bias', None)
            self.register_parameter('weight', None)
            return

        self.weight = nn.Parameter(weight_initializer(self.normalized_shape))
        if self.config.bias:
            self.bias = nn.Parameter(bias_initializer(self.normalized_shape))
        else:
            self.register_parameter('bias', None)

    @abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError

    @staticmethod
    def _cast_if_autocast_enabled(
        tensor: torch.Tensor, dtype: Optional[torch.dtype] = None
    ) -> torch.Tensor:
        """Cast ``tensor`` to ``dtype`` (or the active autocast dtype) when autocast
        is enabled for the tensor's device; otherwise return it unchanged."""
        # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the
        # separate function `is_autocast_cpu_enabled()` for CPU autocast.
        # See https://github.com/pytorch/pytorch/issues/110966.
        device_kind = tensor.device.type
        if device_kind == 'cuda' and torch.is_autocast_enabled():
            target = torch.get_autocast_gpu_dtype() if dtype is None else dtype
            return tensor.to(dtype=target)
        if device_kind == 'cpu' and torch.is_autocast_cpu_enabled():
            target = torch.get_autocast_cpu_dtype() if dtype is None else dtype
            return tensor.to(dtype=target)
        return tensor


class LayerNorm(_LayerNorm):
    """Standard layer norm built on :func:`torch.nn.functional.layer_norm`, with an
    optional low precision mode.

    In low precision mode the input and the affine parameters are cast to the
    active autocast dtype (if any) and the normalisation itself runs with
    autocast disabled.
    """

    def __init__(
        self,
        config: JinaLNormConfig,
        size: int,
        low_precision: bool = False,
        elementwise_affine: Optional[bool] = None,
        eps: float = 1e-05,
    ):
        super().__init__(
            config, size=size, elementwise_affine=elementwise_affine, eps=eps
        )
        self.low_precision = low_precision

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.low_precision:
            return f.layer_norm(
                x,
                self.normalized_shape,
                weight=self.weight,
                bias=self.bias,
                eps=self.eps,
            )

        device_type = x.device.type
        cast = self._cast_if_autocast_enabled
        x_cast = cast(x)
        # Absent affine parameters stay None
        weight_cast = self.weight if self.weight is None else cast(self.weight)
        bias_cast = self.bias if self.bias is None else cast(self.bias)
        with torch.autocast(enabled=False, device_type=device_type):
            return f.layer_norm(
                x_cast,
                self.normalized_shape,
                weight=weight_cast,
                bias=bias_cast,
                eps=self.eps,
            )


@use_kernel_forward_from_hub('RMSNorm')
class RMSLayerNorm(_LayerNorm):
    """Root-mean-square layer norm: the input is divided by the RMS of its last
    dimension, without mean subtraction."""

    def __init__(
        self,
        config: JinaLNormConfig,
        size: int,
        elementwise_affine: Optional[bool] = None,
        eps: float = 1e-5,
    ):
        super().__init__(
            config, size=size, elementwise_affine=elementwise_affine, eps=eps
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The statistics are computed in float32 with autocast disabled, then the
        # result is cast back to the input dtype
        with torch.autocast(enabled=False, device_type=x.device.type):
            input_dtype = x.dtype
            x_fp32 = x.to(torch.float32)
            mean_square = x_fp32.pow(2).mean(-1, keepdim=True)
            normed = (x_fp32 * torch.rsqrt(mean_square + self.eps)).to(input_dtype)

        if self.weight is None:
            return normed
        scaled = self.weight * normed
        if self.bias is None:
            return scaled
        return scaled + self.bias


def build_layer_norm(config: JinaLNormConfig, size: int, **kwargs) -> _LayerNorm:
    """Create the layer norm selected by ``config.type``.

    ``LayerNormType.default`` and ``LayerNormType.low_precision`` map to
    :class:`LayerNorm` (without / with low precision); every other type yields
    an :class:`RMSLayerNorm`. Extra keyword arguments go to the constructor.
    """
    norm_type = config.type
    if norm_type == LayerNormType.default:
        low_precision = False
    elif norm_type == LayerNormType.low_precision:
        low_precision = True
    else:
        return RMSLayerNorm(config, size=size, **kwargs)
    return LayerNorm(config, size=size, low_precision=low_precision, **kwargs)


"""
Multi Head Scaled Dot Product Attention module and utilities. Adapted from AllenAI Molmo
https://github.com/allenai/molmo
"""


def _create_causal_mask(seq_len: int, device: torch.device) -> torch.Tensor:
    with torch.autocast(device.type, enabled=False):
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
            diagonal=1,
        )
        causal_mask.masked_fill_(causal_mask == 1, torch.finfo(causal_mask.dtype).min)
        causal_mask = causal_mask.view(1, 1, seq_len, seq_len)  # type: ignore
    return causal_mask


def _ensure_finite(
    x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False
):
    """Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the
    dtype when ``check_neg_inf`` is ``True`` and replace ``float("inf")`` with the
    maximum value of the dtype when ``check_pos_inf`` is ``True``"""
    if check_neg_inf:
        x.masked_fill_(x == float('-inf'), torch.finfo(x.dtype).min)
    if check_pos_inf:
        x.masked_fill_(x == float('inf'), torch.finfo(x.dtype).max)


def resolve_causal_mask(
    attention_mask: Optional[torch.Tensor],
    causal_mask: Optional[torch.Tensor],
    past_key_values: Optional[Cache],
    batch_size: int,
    seq_len: int,
    past_length: int,
    device,
):
    """Merge a padding style attention mask with a causal mask into one additive
    float mask.

    :param attention_mask: Optional mask with ones at positions to attend to and
        zeros elsewhere. A 2D mask is cut to ``past_length + seq_len`` columns and
        reshaped to ``(batch_size, 1, 1, len)``; any other rank gets a new
        dimension inserted at index 1
    :param causal_mask: Optional causal mask. Created when missing; ``int8`` and
        ``bool`` masks are turned into float masks whose zero entries hold the
        most negative float value
    :param past_key_values: Optional cache; only checked for being ``None``
    :param batch_size: Batch size used to reshape a 2D ``attention_mask``
    :param seq_len: Length of the current input
    :param past_length: Number of cached positions
    :param device: Device for a newly created causal mask
    :return: The merged float mask, or ``None`` when all three of
        ``attention_mask``, ``causal_mask`` and ``past_key_values`` are ``None``
    """
    if attention_mask is not None:
        if attention_mask.dim() == 2:
            # shape: (batch_size, 1, 1, seq_len)
            clipped = attention_mask[:, : past_length + seq_len]
            attention_mask = clipped.to(dtype=torch.float).view(batch_size, -1)[
                :, None, None, :
            ]
        else:
            attention_mask = attention_mask.unsqueeze(1).to(dtype=torch.float)
        # Zeros in the mask become the most negative value, ones become zero
        attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min

    # Merge attention mask with causal mask (attention bias)
    # NOTE: We need to initialize the attn bias in order for attn to
    # work properly with key+value cache. Otherwise
    # `f.scaled_dot_product_attention()` doesn't seem to compute scores correctly
    if causal_mask is None and attention_mask is None and past_key_values is None:
        return None

    if causal_mask is None:
        causal_mask = _create_causal_mask(past_length + seq_len, device)
    elif causal_mask.dtype in (torch.int8, torch.bool):
        causal_mask = causal_mask.to(dtype=torch.float)
        causal_mask.masked_fill_(
            causal_mask == 0.0, torch.finfo(causal_mask.dtype).min
        )

    if attention_mask is not None:
        mask_len = attention_mask.shape[-1]
    elif past_key_values is not None:
        mask_len = past_length + seq_len
    else:
        mask_len = seq_len
    causal_mask = causal_mask[:, :, :mask_len, :mask_len].to(dtype=torch.float)

    if attention_mask is None:
        return causal_mask

    # Add in the masking bias
    merged = causal_mask + attention_mask
    # Might get -infs after adding attention mask, since
    # dtype.min + dtype.min = -inf. `f.scaled_dot_product_attention()`
    # doesn't handle -inf like you'd expect, instead it can produce NaNs
    _ensure_finite(merged, check_neg_inf=True, check_pos_inf=False)
    return merged


def cast_attention_mask(bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
    """Cast the additive mask ``bias`` to the dtype attention will run in.

    The target is the active autocast dtype for the device of ``bias`` when
    autocast is enabled, and ``input_dtype`` otherwise. After an actual cast,
    negative infinities are replaced by the minimum of the new dtype.
    """
    # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the
    # separate function `is_autocast_cpu_enabled()` for CPU autocast.
    # See https://github.com/pytorch/pytorch/issues/110966.
    device_kind = bias.device.type
    if device_kind == 'cuda' and torch.is_autocast_enabled():
        target_dtype = torch.get_autocast_gpu_dtype()
    elif device_kind == 'cpu' and torch.is_autocast_cpu_enabled():
        target_dtype = torch.get_autocast_cpu_dtype()
    else:
        target_dtype = input_dtype

    if bias.dtype == target_dtype:
        return bias

    converted = bias.to(target_dtype)
    _ensure_finite(converted, check_neg_inf=True, check_pos_inf=False)
    return converted


def repeat_kv(hidden_states: torch.Tensor, n: int) -> torch.Tensor:
    """Repeat every key/value head ``n`` times along the head dimension.

    Takes ``(batch, kv_heads, seq_len, head_dim)`` and returns
    ``(batch, kv_heads * n, seq_len, head_dim)`` with the copies of one head
    placed next to each other. For ``n == 1`` the input itself is returned.
    """
    batch, kvheads, slen, hdim = hidden_states.shape
    if n == 1:
        return hidden_states
    tiled = hidden_states.unsqueeze(2).expand(-1, -1, n, -1, -1)
    return tiled.reshape(batch, kvheads * n, slen, hdim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **_,
):
    """Plain matmul/softmax attention.

    :param module: Attention module; ``num_key_value_groups`` and ``training`` are
        read from it
    :param query: Query states
    :param key: Key states, repeated ``num_key_value_groups`` times along dim 1
    :param value: Value states, repeated like ``key``
    :param attention_mask: Optional additive mask; its last dimension is cut to the
        key length before it is added to the scores
    :param scaling: Factor applied to ``query`` before the score matmul
    :param dropout: Dropout probability applied to the attention weights
    :return: The attention output with dims 1 and 2 swapped (made contiguous) and
        the attention weights
    """
    n_groups = module.num_key_value_groups
    keys = repeat_kv(key, n_groups)
    values = repeat_kv(value, n_groups)

    scores = torch.matmul(query * scaling, keys.transpose(2, 3))
    if attention_mask is not None:
        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]

    # Softmax runs in float32, then goes back to the query dtype
    probs = f.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = f.dropout(probs, p=dropout, training=module.training).to(values.dtype)

    context = torch.matmul(probs, values).to(query.dtype)
    return context.transpose(1, 2).contiguous(), probs


def rotate_half(x: torch.Tensor):
    """Split the last dimension of a 4D tensor into a first and a second half and
    return ``cat(-second, first)``."""
    b, nh, t, hs = x.size()
    halves = x.view(b, nh, t, 2, hs // 2)
    first, second = halves[..., 0, :], halves[..., 1, :]
    return torch.cat((second.neg(), first), dim=-1)


def apply_rotary_positional_embeddings(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> torch.Tensor:
    """Rotate ``x`` with the given tables: ``x * cos + rotate_half(x) * sin``,
    cast back to the dtype of ``x``."""
    rotated = rotate_half(x)
    mixed = x * cos + rotated * sin
    return mixed.to(x.dtype)


def apply_rope_to_qk(q, k, cos, sin):
    """Apply rotary embeddings to queries and keys.

    Both tensors are upcast to float32, rotated with autocast disabled and cast
    back to their original types.
    """
    q_fp32 = q.float()
    k_fp32 = k.float()
    with torch.autocast(q.device.type, enabled=False):
        q_rot = apply_rotary_positional_embeddings(q_fp32, cos, sin)
        k_rot = apply_rotary_positional_embeddings(k_fp32, cos, sin)
    return q_rot.type_as(q), k_rot.type_as(k)


class MHSDPA(nn.Module):
    """Multi Head Scaled Dot Product Attention.

    Works either as self-attention (a single fused ``qkv_w`` projection) or as
    cross-attention (separate ``q_w`` and ``kv_w`` projections). The number of
    key/value heads may be smaller than the number of query heads. Optional
    features, all driven by the config: clipping of the projected Q/K/V values,
    layer norms on Q/K/V and on the attention output, fp32 attention and sliding
    window attention.
    """

    def __init__(
        self,
        config: JinaAttentionConfig,
        hidden_size: int,
        output_size: Optional[int] = None,
        self_attn: bool = True,
        is_causal: bool = False,
        layer_idx: int = 0,
        attn_implementation: Optional[str] = None,
    ):
        """
        :param config: Attention configuration
        :param hidden_size: Input feature size of the Q/K/V projections
        :param output_size: Output feature size of the output projection, defaults
            to ``hidden_size``
        :param self_attn: If true a single fused QKV projection is created,
            otherwise separate Q and KV projections
        :param is_causal: Stored and later forwarded to the attention interface
        :param layer_idx: Layer index handed to the KV cache on update
        :param attn_implementation: Name of the attention implementation to
            resolve at construction time, defaults to ``'eager'``
        :raises ValueError: If the attention implementation is incompatible with
            the config, see `_get_attention_interface`
        """
        super().__init__()
        self.config = config

        self.hidden_size = hidden_size
        self.n_heads = config.n_heads
        self.n_kv_heads = config.n_kv_heads or self.n_heads
        self.n_kv_groups = self.n_heads // self.n_kv_heads
        self.output_size = output_size or self.hidden_size

        # NOTE: for HF attention interface compatibility
        self.num_key_value_groups = self.n_kv_groups
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        self.sliding_window = config.sliding_window

        # Fall back to an even split of the hidden size across heads when the
        # config does not set an explicit head dimension
        head_dim = config.head_dim
        if head_dim is None:
            assert self.hidden_size % self.n_heads == 0
            head_dim = self.hidden_size // self.n_heads
        self.head_dim = head_dim

        # `scaling` is the name passed on to the attention interface in `forward`
        self.scale = config.softmax_scale or self.head_dim**-0.5
        self.scaling = self.scale

        # Make sure QKV clip coefficient is positive, otherwise it's not well-defined
        if config.clip_qkv is not None:
            assert config.clip_qkv > 0
        self.clip_qkv = config.clip_qkv

        self.fp32_attn = config.fp32
        self.self_attn = self_attn
        # Output widths of the Q, K and V projections, in that order
        self.fused_dims = (
            self.n_heads * self.head_dim,
            self.n_kv_heads * self.head_dim,
            self.n_kv_heads * self.head_dim,
        )
        if self.self_attn:
            self.qkv_w = nn.Linear(self.hidden_size, sum(self.fused_dims), bias=False)
        else:
            self.q_w = nn.Linear(
                self.hidden_size,
                self.n_heads * self.head_dim,
                bias=False,
            )
            self.kv_w = nn.Linear(
                self.hidden_size,
                sum(self.fused_dims) - self.n_heads * self.head_dim,
                bias=False,
            )
        self.out = nn.Linear(
            self.n_heads * self.head_dim,
            self.output_size,
            bias=config.o_bias,
        )
        # The Q/K/V biases always exist as zero-initialised parameters; the config
        # flags only decide whether they require gradients
        self.q_b = nn.Parameter(
            torch.zeros(self.n_heads * self.head_dim),
            requires_grad=config.q_bias,
        )
        self.k_b = nn.Parameter(
            torch.zeros(self.n_kv_heads * self.head_dim),
            requires_grad=config.k_bias,
        )
        self.v_b = nn.Parameter(
            torch.zeros(self.n_kv_heads * self.head_dim),
            requires_grad=config.v_bias,
        )
        # All norms default to identity and are replaced below when enabled
        self.q_lnorm = nn.Identity()
        self.k_lnorm = nn.Identity()
        self.v_lnorm = nn.Identity()
        self.inner_lnorm = nn.Identity()
        self.add_q_lnorm = config.q_lnorm
        self.add_k_lnorm = config.k_lnorm
        self.add_v_lnorm = config.v_lnorm
        self.qkv_lnorm_on_heads = config.qkv_lnorm_on_heads
        # Per-head norms normalise over `head_dim`, otherwise over the full
        # projection width
        q_lnorm_size = (
            self.head_dim if self.qkv_lnorm_on_heads else self.n_heads * self.head_dim
        )
        kv_lnorm_size = (
            self.head_dim
            if self.qkv_lnorm_on_heads
            else self.n_kv_heads * self.head_dim
        )
        if self.add_q_lnorm:
            self.q_lnorm = build_layer_norm(
                config.lnorm_config,
                size=q_lnorm_size,
                elementwise_affine=config.lnorm_config.with_affine,
            )
        if self.add_k_lnorm:
            self.k_lnorm = build_layer_norm(
                config.lnorm_config,
                size=kv_lnorm_size,
                elementwise_affine=config.lnorm_config.with_affine,
            )
        if self.add_v_lnorm:
            self.v_lnorm = build_layer_norm(
                config.lnorm_config,
                size=kv_lnorm_size,
                elementwise_affine=config.lnorm_config.with_affine,
            )
        if config.inner_lnorm:
            self.inner_lnorm = build_layer_norm(
                config.lnorm_config,
                size=(self.n_heads * self.head_dim),
                elementwise_affine=config.lnorm_config.with_affine,
            )
        self.drop_p = config.dropout
        # Resolve the default attention callable once; only the callable is kept
        self.attn_interface, *_ = self._get_attention_interface(
            attn_implementation or 'eager', None, None
        )

    def _get_attention_interface(
        self,
        attn_implementation: str,
        attn_mask: Optional[torch.Tensor] = None,
        is_causal: Optional[bool] = None,
    ) -> Tuple[Callable, Optional[torch.Tensor], Optional[bool]]:
        """Resolve the attention callable and adapt the mask / causal flag to it.

        As a side effect the chosen implementation name is written to
        ``self.config._attn_implementation``.

        :param attn_implementation: ``'eager'`` or a key of
            ``ALL_ATTENTION_FUNCTIONS``
        :param attn_mask: Optional attention mask to adapt
        :param is_causal: Optional causal flag to adapt
        :return: A tuple of the attention callable, the adapted mask and the
            adapted causal flag
        :raises ValueError: If a flash implementation is requested together with
            fp32 attention, or if a sliding window is configured without a flash
            implementation
        """
        if 'flash' in attn_implementation and self.fp32_attn:
            raise ValueError('Flash attention does not support fp32 attention')
        if self.sliding_window != -1 and 'flash' not in attn_implementation:
            raise ValueError('Sliding window attention requires flash attention')

        attn_interface: Callable = eager_attention_forward
        if attn_implementation != 'eager':
            attn_interface = ALL_ATTENTION_FUNCTIONS[attn_implementation]

        setattr(self.config, '_attn_implementation', attn_implementation)

        if 'flash' in attn_implementation:
            # Flash attention expects attention mask to be a 2D padding only
            # mask
            # Depending on the value of is_causal, the function will
            # automatically apply causal masking or not
            # NOTE(review): the conversion below assumes an additive mask of shape
            # (batch, 1, q_len, k_len) in which masked entries are <= -1 - confirm
            if attn_mask is not None:
                # convert to 0,1 in int32
                attn_mask = (attn_mask > -1).to(torch.int32)
                # take maximum along sequence dimension
                attn_mask = attn_mask.squeeze(1).max(dim=1)[0]
        elif 'sdpa' in attn_implementation:
            # An explicit mask takes precedence over the causal flag
            if attn_mask is not None and is_causal is not None:
                is_causal = False

        elif attn_implementation == 'eager':
            # The eager path has no causal switch of its own, so causality has to
            # be part of an explicit 4D mask
            if is_causal:
                assert attn_mask is not None
                assert attn_mask.ndim == 4

        return attn_interface, attn_mask, is_causal

    def forward(
        self,
        xq: torch.Tensor,
        xk: Optional[torch.Tensor] = None,
        rope_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attn_implementation: Optional[str] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :param xq: Input the queries are computed from; in self-attention mode
            also the input of keys and values. Shape `(batch, q_len, hidden_size)`
        :param xk: Input the keys and values are computed from, required in
            cross-attention mode and unused in self-attention mode
        :param rope_embeddings: Optional ``(cos, sin)`` pair applied to queries
            and keys
        :param attention_mask: Optional 4D attention mask; it is cropped to the
            current query / key lengths and passed through `cast_attention_mask`
        :param past_key_values: Optional KV cache, updated with the new keys and
            values of this layer
        :param cache_position: Passed to the cache update as ``cache_position``
        :param attn_implementation: Optional per-call override of the attention
            implementation chosen at construction time
        :param kwargs: Extra keyword arguments forwarded to the attention
            interface
        :return: A tuple of the projected attention output of shape
            `(batch, q_len, output_size)` and whatever the attention interface
            returns as attention weights
        """
        if self.self_attn:
            # qkv = self.qkv_w(xq)
            qkv_b = torch.cat((self.q_b, self.k_b, self.v_b))
            # qkv += qkv_b
            qkv = f.linear(xq, self.qkv_w.weight, qkv_b)
            if self.clip_qkv is not None:
                qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
            q, k, v = qkv.split(self.fused_dims, dim=-1)
        else:
            assert xk is not None
            q = f.linear(xq, self.q_w.weight, self.q_b)
            kv_b = torch.cat((self.k_b, self.v_b))
            kv = f.linear(xk, self.kv_w.weight, kv_b)
            if self.clip_qkv is not None:
                q.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
                kv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
            k, v = kv.split(self.fused_dims[1:], dim=-1)

        b, tq, _ = q.size()
        _, tk, __ = k.size()  # batch size, sequence length, d_model
        # Remember the projection dtype so the output can be cast back to it
        og_dtype = k.dtype
        if self.fp32_attn:
            dtype = torch.float32
            q = q.to(torch.float)
            k = k.to(torch.float)
        else:
            dtype = og_dtype

        # Optionally apply layer norm to queries, keys and values over the full
        # projection width
        if not self.qkv_lnorm_on_heads:
            q = self.q_lnorm(q).to(dtype=dtype)
            k = self.k_lnorm(k).to(dtype=dtype)
            v = self.v_lnorm(v).to(dtype=dtype)

        # Move head forward to be next to the batch dim
        # shape: (bs, nh, t, hs)
        q = q.view(b, tq, self.n_heads, -1).transpose(1, 2)
        # shape: (b, n_kv_h, t, hs)
        k = k.view(b, tk, self.n_kv_heads, -1).transpose(1, 2)
        # shape: (b, n_kv_h, t, hs)
        v = v.view(b, tk, self.n_kv_heads, -1).transpose(1, 2)

        # Optionally apply layer norm to queries, keys and values per head
        if self.qkv_lnorm_on_heads:
            q = self.q_lnorm(q).to(dtype=dtype)
            k = self.k_lnorm(k).to(dtype=dtype)
            v = self.v_lnorm(v).to(dtype=dtype)

        cache_kwargs: Dict[str, torch.Tensor] = {'cache_position': cache_position}
        if rope_embeddings is not None:
            cos, sin = rope_embeddings
            # The cache receives the tables as given; the extra axis added below
            # broadcasts them over the head dimension
            cache_kwargs['cos'] = cos
            cache_kwargs['sin'] = sin
            cos = cos.unsqueeze(1)
            sin = sin.unsqueeze(1)
            q, k = apply_rope_to_qk(q, k, cos, sin)

        if past_key_values is not None:
            k, v = past_key_values.update(k, v, self.layer_idx, cache_kwargs)

        if attention_mask is not None:
            # Resize and cast attention bias.
            # The current dtype of the attention bias might not match the dtype that the
            # SDP attn function will run in if AMP is enabled, and this can be a problem
            # if some tokens are masked out due to padding as down-casting the attention
            # bias to the autocast precision will result in -infs, which will cause the
            # SDP attn function to produce NaNs.
            qlen, klen = q.shape[-2], k.shape[-2]
            attention_mask = cast_attention_mask(
                attention_mask[:, :, klen - qlen : klen, :klen], dtype
            )

        # Start from the interface resolved in `__init__`; a per-call override
        # also re-adapts the mask and the causal flag
        attention_interface = self.attn_interface
        is_causal = self.is_causal
        if attn_implementation is not None:
            attention_interface, attention_mask, is_causal = (
                self._get_attention_interface(
                    attn_implementation,
                    attention_mask,
                    self.is_causal,
                )
            )

        if self.sliding_window != -1:
            kwargs['sliding_window'] = self.sliding_window
        if is_causal is not None:
            kwargs['is_causal'] = is_causal

        attn, weights = attention_interface(
            self,
            q,
            k,
            v,
            attention_mask,
            dropout=0.0 if not self.training else self.drop_p,
            scaling=self.scaling,
            **kwargs,
        )
        attn = attn.to(og_dtype)
        # Merge the heads back into a single feature dimension
        attn = attn.view(b, tq, -1)
        out = self.inner_lnorm(attn)
        out = self.out(out)

        return out, weights


"""
FFN module. Adapted from AllenAI Molmo https://github.com/allenai/molmo
"""


class FFN(nn.Module):
    """Feed-forward network: up projection, activation, optional norm, down
    projection. With a gated activation the up projection is twice as wide and
    its output is split into a value half and a gate half.
    """

    def __init__(
        self,
        config: JinaFFNConfig,
        hidden_size: int,
        output_size: Optional[int] = None,
        layer_idx: int = 0,
    ):
        """
        :param config: Feed-forward configuration
        :param hidden_size: Input feature size
        :param output_size: Output feature size, defaults to ``hidden_size``
        :param layer_idx: Index of the layer this network belongs to (stored only)
        """
        super().__init__()
        self.config = config
        self.hidden_size = hidden_size
        self.output_size = output_size or hidden_size
        self.intermediate_size = config.size or config.ratio * hidden_size
        self.layer_idx = layer_idx
        self.gated_activation = config.gated_activation
        self.use_bias = config.bias

        self.act = ACT2FN[config.activation_type.lower()]

        # A gated activation needs both the value and the gate from `up`
        if self.gated_activation:
            up_features = 2 * self.intermediate_size
        else:
            up_features = self.intermediate_size

        self.up = nn.Linear(self.hidden_size, up_features, bias=self.use_bias)
        self.down = nn.Linear(
            self.intermediate_size, self.output_size, bias=self.use_bias
        )
        if config.inner_lnorm:
            self.inner_lnorm = build_layer_norm(
                self.config.lnorm_config, self.intermediate_size
            )
        else:
            self.inner_lnorm = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        :param x: Input tensor whose last dimension is ``hidden_size``
        :return: Tensor whose last dimension is ``output_size``
        """
        hidden = self.up(x)
        if self.gated_activation:
            # First half carries the values, second half the gate
            value, gate = hidden.chunk(2, dim=-1)
            hidden = self.act(gate) * value
        else:
            hidden = self.act(hidden)
        return self.down(self.inner_lnorm(hidden))


"""
Transformer block. Adapted from AllenAI Molmo https://github.com/allenai/molmo
"""


class TransformerBlock(GradientCheckpointingLayer):
    """Transformer block made of a self-attention and a feed-forward sub-layer.

    Each sub-layer is wrapped in a residual connection. Depending on
    ``config.postnorm`` the layer norm is applied to the sub-layer input
    (pre-norm) or to the sub-layer output (post-norm). Optional layer scale,
    residual path dropout and residual dropout are applied to the sub-layer
    output before it is added to the residual stream.
    """

    def __init__(
        self,
        config: JinaTransformerBlockConfig,
        hidden_size: int,
        is_causal: bool = True,
        layer_idx: int = 0,
        attn_implementation: Optional[str] = None,
    ):
        """
        :param config: Transformer block configuration
        :param hidden_size: Feature size of the residual stream
        :param is_causal: Forwarded to the attention module
        :param layer_idx: Index of this block, forwarded to the sub-layers
        :param attn_implementation: Attention implementation forwarded to the
            attention module
        """
        super().__init__()
        self.config = config
        self.hidden_size = hidden_size
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        self.drop_path = config.residual_path_dropout
        self.attn_lscale_init = config.attn_lscale_init
        self.ffn_lscale_init = config.ffn_lscale_init
        self.postnorm = config.postnorm

        self.attn = MHSDPA(
            config.attn_config,
            hidden_size=self.hidden_size,
            is_causal=is_causal,
            self_attn=True,
            layer_idx=layer_idx,
            attn_implementation=attn_implementation,
        )
        self.ffn = FFN(
            config.ffn_config, hidden_size=self.hidden_size, layer_idx=layer_idx
        )
        self.attn_drop = Dropout(
            config.residual_dropout, mask_p=config.residual_response_dropout
        )
        self.ffn_drop = Dropout(
            config.residual_dropout, mask_p=config.residual_response_dropout
        )
        # A single path dropout module is shared by both sub-layers
        self.path_drop = (
            ResidualPathDropout(self.drop_path)
            if self.drop_path > 0.0
            else nn.Identity()
        )
        self.attn_lnorm = build_layer_norm(config.lnorm_config, size=hidden_size)
        self.ffn_lnorm = build_layer_norm(config.lnorm_config, size=hidden_size)
        # Layer scale is only instantiated when an init value is configured
        self.attn_lscale = nn.Identity()
        self.ffn_lscale = nn.Identity()
        if self.attn_lscale_init is not None:
            self.attn_lscale = LayerScale(self.hidden_size, self.attn_lscale_init)
        if self.ffn_lscale_init is not None:
            self.ffn_lscale = LayerScale(self.hidden_size, self.ffn_lscale_init)

    def forward(
        self,
        x: torch.Tensor,
        rope_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        drop_mask: Optional[torch.Tensor] = None,
        attn_implementation: Optional[str] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        :param x: Residual stream input
        :param rope_embeddings: Optional ``(cos, sin)`` pair forwarded to the
            attention module
        :param attention_mask: Optional attention mask forwarded to the attention
            module
        :param past_key_values: Optional KV cache forwarded to the attention
            module
        :param cache_position: Cache positions forwarded to the attention module
        :param drop_mask: Optional mask forwarded to both residual dropouts
        :param attn_implementation: Optional per-call attention implementation
        :param kwargs: Extra keyword arguments forwarded to the attention module
        :return: A tuple of the updated residual stream and the attention weights
            returned by the attention module
        """
        # --- attention sub-layer ---
        x_norm = x if self.postnorm else self.attn_lnorm(x)

        x_attn, x_attn_weights = self.attn(
            x_norm,
            rope_embeddings=rope_embeddings,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            cache_position=cache_position,
            attn_implementation=attn_implementation,
            **kwargs,
        )
        if self.postnorm:
            x_attn = self.attn_lnorm(x_attn)

        x_attn = self.path_drop(self.attn_lscale(x_attn))
        x = x + self.attn_drop(x_attn, drop_mask=drop_mask)

        # --- feed-forward sub-layer ---
        x_norm = x if self.postnorm else self.ffn_lnorm(x)

        x_ffn = self.ffn(x_norm)
        if self.postnorm:
            # Normalise the FFN output, mirroring the attention branch.
            # Normalising `x` here would throw the FFN result away.
            x_ffn = self.ffn_lnorm(x_ffn)

        x_ffn = self.path_drop(self.ffn_lscale(x_ffn))
        x = x + self.ffn_drop(x_ffn, drop_mask=drop_mask)

        return x, x_attn_weights


"""
Vision Language Connector. Adapted from AllenAI Molmo https://github.com/allenai/molmo
"""


class VisionLanguageConnector(GradientCheckpointingLayer):
    """Vision-Language Connector.

    Turns per-crop patch features into projected features in three steps:
    optional padding embeddings, 2D pooling over windows of
    ``pooling_h x pooling_w`` patches and a final projection.
    """

    def __init__(
        self,
        config: JinaVLConnectorConfig,
        input_size: int,
        intermediate_size: int,
        output_size: int,
        n_patches: Tuple[int, int],
        attn_implementation: Optional[str] = None,
    ):
        """
        :param config: Connector configuration
        :param input_size: Feature size of the incoming patch features
        :param intermediate_size: Feature size between pooling and projection
        :param output_size: Feature size of the projected output
        :param n_patches: Number of patches per crop as ``(height, width)``
        :param attn_implementation: Attention implementation used by attention
            pooling; flash variants are replaced with ``'sdpa'``
        """
        super().__init__()
        self.config = config
        self.input_size = input_size
        self.intermediate_size = intermediate_size
        self.output_size = output_size
        self.n_patches = n_patches

        self.padding_embed_type = config.padding_embed_type
        self.pooling_type = config.pooling_type
        self.projector_type = config.projector_type
        self.spatial_merge_size = config.spatial_merge_size
        self.pooling_h = config.pooling_h
        self.pooling_w = config.pooling_w
        self.pad_embed = None
        self.pooling = None
        self.projector: Union[nn.Linear, nn.ModuleList, FFN]

        if config.padding_embed_type is not None:
            if config.padding_embed_type in {
                ImagePaddingEmbedType.regress,
                ImagePaddingEmbedType.pad_embed,
            }:
                self.pad_embed = nn.Parameter(torch.zeros((self.input_size,)))
            else:
                # Two embeddings: one for fully padded and one for partially
                # padded patches
                self.pad_embed = nn.Parameter(torch.zeros((2, self.input_size)))

        pooling_input_size = self.input_size
        projector_input_size = self.intermediate_size
        if config.pooling_type in {
            ImagePooling2DType.attention,
            ImagePooling2DType.attention_meanq,
            ImagePooling2DType.attention_2wide,
        }:
            assert config.attn_pooling_config is not None
            if config.pooling_type == ImagePooling2DType.attention_2wide:
                pooling_input_size *= 2

            # Flash Attention can cause Inf grads in the attention pooling layer
            # because of very large batch sizes. Setting this to sdpa does not cost us
            # much since sequence lengths in the case of attention pooling are very
            # small
            attn_implementation = attn_implementation or 'eager'
            if attn_implementation.startswith('flash'):
                attn_implementation = 'sdpa'
            self.pooling = MHSDPA(
                config.attn_pooling_config,
                hidden_size=pooling_input_size,
                is_causal=False,
                self_attn=False,
                output_size=projector_input_size,
                attn_implementation=attn_implementation,
            )
        elif config.pooling_type in [
            ImagePooling2DType.stack,
            ImagePooling2DType.token_merger,
        ]:
            # Stacking concatenates all patches of a window along the features
            projector_input_size *= config.pooling_h * config.pooling_w

        if config.projector_type in {
            ImageProjectionType.mlpx2,
            ImageProjectionType.mlp,
        }:
            assert config.mlp_projector_config is not None
            mlp_projector_kwargs = dict(
                config=config.mlp_projector_config,
                hidden_size=projector_input_size,
                output_size=output_size,
            )
            if config.projector_type == ImageProjectionType.mlpx2:
                # TODO: Before there were two dropouts applied
                self.projector = nn.ModuleList(
                    [FFN(**mlp_projector_kwargs), Residual(FFN(**mlp_projector_kwargs))]
                )
            else:
                self.projector = FFN(**mlp_projector_kwargs)
        else:
            self.projector = nn.Linear(
                projector_input_size,
                output_size,
                bias=False,
            )
        self.projector_dropout = Dropout(config.projector_dropout)
        self.feature_dropout = Dropout(config.feature_dropout)

    def forward(
        self,
        image_features: torch.Tensor,
        image_masks: Optional[torch.Tensor] = None,
        attn_implementation: Optional[str] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> torch.Tensor:
        """
        :param image_features: A tensor of shape
            `(batch_size, num_crops, num_patches, embed_dim)`
        :param image_masks: Per-patch mask, required when a padding embedding type
            is configured. A value of zero marks a fully padded patch
        :param attn_implementation: Optional attention implementation for mean
            query attention pooling; flash variants are replaced with ``'sdpa'``
        :param kwargs: Extra keyword arguments forwarded to the attention pooling
            module (mean query pooling only)
        :return: The projected features, i.e. the output of the projector applied
            to a tensor of shape `(batch_size, num_crops, h * w, dim)` where ``h``
            and ``w`` are the pooled grid sizes
        """
        # image_features:
        # (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim)
        bs, ncrops = image_features.shape[:2]
        ogtype = image_features.dtype

        if self.padding_embed_type is not None:
            assert image_masks is not None
            if self.padding_embed_type == ImagePaddingEmbedType.pad_embed:
                all_pad = (image_masks == 0).to(dtype=torch.float32)
                pad_embed = self.pad_embed[None, None, None, :]
                image_features = image_features + pad_embed * torch.unsqueeze(
                    all_pad, -1
                )
            elif self.padding_embed_type == ImagePaddingEmbedType.regress:
                pad_embed = self.pad_embed[None, None, None, :]
                image_features = image_features + pad_embed * torch.unsqueeze(
                    torch.maximum(image_masks, torch.zeros_like(image_masks)), -1
                )
            else:
                pad_embed = self.pad_embed[:, None, None, None, :]
                all_pad = image_masks == 0
                partial_pad = torch.logical_and(
                    image_masks < 1, torch.logical_not(all_pad)
                ).to(dtype=torch.float32)
                all_pad = all_pad.to(dtype=torch.float32)
                image_features = image_features + pad_embed[0] * torch.unsqueeze(
                    all_pad, -1
                )
                image_features = image_features + pad_embed[1] * torch.unsqueeze(
                    partial_pad, -1
                )

        # The fp32 masks above may have promoted the features
        image_features = image_features.to(dtype=ogtype)
        image_features = self.feature_dropout(image_features)
        image_features = image_features.reshape((bs, ncrops) + self.n_patches + (-1,))

        # Number of rows / columns missing to reach a multiple of the pooling
        # window. For a window of 2 this equals `n % 2`, but for larger windows
        # the remainder is not the required amount of padding
        pad_h = -self.n_patches[0] % self.pooling_h
        pad_w = -self.n_patches[1] % self.pooling_w
        if pad_h != 0 or pad_w != 0:
            # Pad so we can still pool mxn patches
            image_features = f.pad(
                image_features,
                (0, 0, 0, pad_w, 0, pad_h, 0, 0, 0, 0),
            )
        if self.pooling_type == ImagePooling2DType.token_merger:
            context_dim = image_features.shape[-1]
            hidden_size = context_dim * (self.spatial_merge_size**2)
            image_features = image_features.view([-1, hidden_size])
        else:
            # One row per pooling window, holding the patches of that window
            image_features = einops.rearrange(
                image_features,
                'b n (h dh) (w dw) c -> (b n h w) (dh dw) c',
                dh=self.pooling_h,
                dw=self.pooling_w,
            )
            image_features = image_features.contiguous()
            if self.pooling_type == ImagePooling2DType.attention_meanq:
                query = image_features.mean(-2, keepdim=True)
                # Flash Attention can cause Inf grads in the attention pooling layer
                # because of very large batch sizes. Setting this to sdpa does not cost
                # us much since sequence lengths in the case of attention pooling are
                # very small
                attn_implementation = attn_implementation or 'eager'
                if attn_implementation.startswith('flash'):
                    attn_implementation = 'sdpa'
                if attn_implementation == 'sdpa':
                    with sdpa_kernel(backends=[SDPBackend.MATH]):
                        image_features, _ = self.pooling(
                            xq=query,
                            xk=image_features,
                            attn_implementation='sdpa',
                            **kwargs,
                        )
                else:
                    image_features, _ = self.pooling(
                        xq=query,
                        xk=image_features,
                        attn_implementation=attn_implementation,
                        **kwargs,
                    )
            elif self.pooling_type not in {
                ImagePooling2DType.none,
                ImagePooling2DType.stack,
            }:
                # The attention module returns `(output, weights)`; only the
                # output is needed. The first patch of each window is the query
                image_features, _ = self.pooling(
                    image_features[:, :1, :], image_features
                )

        # Size of the pooled grid, computed from the padded patch grid
        h = (self.n_patches[0] + pad_h) // self.pooling_h
        w = (self.n_patches[1] + pad_w) // self.pooling_w

        image_features = image_features.reshape(bs, ncrops, h * w, -1)

        if isinstance(self.projector, nn.ModuleList):
            # `nn.ModuleList` is a container without a forward pass, so its
            # members are applied one after the other
            for layer in self.projector:
                image_features = layer(image_features)
            return image_features
        return self.projector(image_features)