Source code for vis4d.op.layer.transformer

"""Transformer layer.

Modified from timm (https://github.com/huggingface/pytorch-image-models) and
mmdetection (https://github.com/open-mmlab/mmdetection).
"""

from __future__ import annotations

import copy

import torch
from torch import Tensor, nn

from .attention import Attention
from .drop import DropPath
from .mlp import TransformerBlockMLP
from .util import build_activation_layer


def inverse_sigmoid(x: Tensor, eps: float = 1e-5) -> Tensor:
    """Inverse function of sigmoid.

    Args:
        x (Tensor): The tensor to invert.
        eps (float): Epsilon to avoid numerical overflow. Defaults to 1e-5.

    Returns:
        Tensor: The inverse sigmoid of x, with the same shape as the input.
    """
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)
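
# Illustrative usage sketch for inverse_sigmoid (the values below are
# assumptions for demonstration, not taken from the module):
#
# >>> probs = torch.tensor([[0.25, 0.75]])
# >>> logits = inverse_sigmoid(probs)
# >>> torch.allclose(logits.sigmoid(), probs, atol=1e-4)
# True
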

def get_clones(module: nn.Module, num: int) -> nn.ModuleList:
    """Create N identical layers."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(num)])
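
# Illustrative usage sketch for get_clones (the layer sizes are assumptions):
#
# >>> proto = nn.Linear(256, 256)
# >>> layers = get_clones(proto, num=6)
# >>> len(layers), layers[0] is layers[1]
# (6, False)
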

class LayerScale(nn.Module):
    """Layer scaler."""

    def __init__(
        self,
        dim: int,
        inplace: bool = False,
        data_format: str = "channels_last",
        init_values: float = 1e-5,
    ):
        """Init layer scaler.

        Args:
            dim (int): Input tensor's dimension.
            inplace (bool): Whether to perform the operation in-place.
                Defaults to False.
            data_format (str): The input data format, either 'channels_last'
                or 'channels_first', representing (B, N, C) and (B, C, H, W)
                format data respectively. Defaults to 'channels_last'.
            init_values (float, optional): Initial values for layer scale.
                Defaults to 1e-5.
        """
        super().__init__()
        assert data_format in {
            "channels_last",
            "channels_first",
        }, "data_format could only be channels_last or channels_first."
        self.inplace = inplace
        self.data_format = data_format
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass."""
        if self.data_format == "channels_first":
            shape = tuple((1, -1, *(1 for _ in range(x.dim() - 2))))
        else:
            shape = tuple((*(1 for _ in range(x.dim() - 1)), -1))
        if self.inplace:
            return x.mul_(self.gamma.view(*shape))
        return x * self.gamma.view(*shape)
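
# Illustrative usage sketch for LayerScale (batch, token, and channel sizes
# are assumptions):
#
# >>> ls = LayerScale(dim=768)                       # channels_last, (B, N, C)
# >>> ls(torch.rand(2, 197, 768)).shape
# torch.Size([2, 197, 768])
# >>> ls_cf = LayerScale(dim=64, data_format="channels_first")
# >>> ls_cf(torch.rand(2, 64, 32, 32)).shape         # (B, C, H, W)
# torch.Size([2, 64, 32, 32])
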

class TransformerBlock(nn.Module):
    """Transformer block for Vision Transformer."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values: float | None = None,
        drop_path: float = 0.0,
        act_layer: nn.Module = nn.GELU(),
        norm_layer: nn.Module | None = None,
    ):
        """Init transformer block.

        Args:
            dim (int): Input tensor's dimension.
            num_heads (int): Number of attention heads.
            mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding
                dim. Defaults to 4.0.
            qkv_bias (bool, optional): If True, add a bias to the qkv
                projection. Defaults to False.
            drop (float, optional): Dropout rate for the attention projection
                and the MLP. Defaults to 0.0.
            attn_drop (float, optional): Dropout rate for attention. Defaults
                to 0.0.
            init_values (float | None, optional): Initial value for layer
                scale. Defaults to None.
            drop_path (float, optional): Dropout rate for drop path. Defaults
                to 0.0.
            act_layer (nn.Module, optional): Activation layer. Defaults to
                nn.GELU.
            norm_layer (nn.Module, optional): Normalization layer. If None,
                use nn.LayerNorm.
        """
        super().__init__()
        self.norm1 = (
            norm_layer(dim) if norm_layer else nn.LayerNorm(dim, eps=1e-6)
        )
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = (
            LayerScale(dim, init_values=init_values)
            if init_values
            else nn.Identity()
        )
        self.drop_path1 = (
            DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        )
        self.norm2 = (
            norm_layer(dim) if norm_layer else nn.LayerNorm(dim, eps=1e-6)
        )
        self.mlp = TransformerBlockMLP(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )
        self.ls2 = (
            LayerScale(dim, init_values=init_values)
            if init_values
            else nn.Identity()
        )
        self.drop_path2 = (
            DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        )

    def __call__(self, data: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            data (torch.Tensor): Input tensor of shape (B, N, dim).

        Returns:
            torch.Tensor: Output tensor of shape (B, N, dim).
        """
        return self._call_impl(data)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass."""
        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        return x
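
# Illustrative usage sketch for TransformerBlock (the embedding dimension,
# head count, and token count are assumptions):
#
# >>> block = TransformerBlock(dim=768, num_heads=12, mlp_ratio=4.0)
# >>> tokens = torch.rand(2, 197, 768)               # (B, N, dim)
# >>> block(tokens).shape
# torch.Size([2, 197, 768])
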

class FFN(nn.Module):
    """Implements feed-forward networks (FFNs) with identity connection."""

    def __init__(
        self,
        embed_dims: int = 256,
        feedforward_channels: int = 1024,
        num_fcs: int = 2,
        dropout: float = 0.0,
        activation: str = "ReLU",
        inplace: bool = True,
        dropout_layer: nn.Module | None = None,
        add_identity: bool = True,
        layer_scale_init_value: float = 0.0,
    ) -> None:
        """Init FFN.

        Args:
            embed_dims (int): The feature dimension. Defaults to 256.
            feedforward_channels (int): The hidden dimension of FFNs.
                Defaults to 1024.
            num_fcs (int): The number of fully-connected layers in FFNs.
                Defaults to 2.
            dropout (float): The dropout rate of FFNs. Defaults to 0.0.
            activation (str): The activation function of FFNs. Defaults to
                "ReLU".
            inplace (bool): Whether to set inplace for the activation.
                Defaults to True.
            dropout_layer (nn.Module | None, optional): The dropout layer
                used when adding the shortcut. If None, nn.Identity is used.
                Defaults to None.
            add_identity (bool, optional): Whether to add the identity
                connection. Defaults to True.
            layer_scale_init_value (float): Initial value of the scale factor
                in LayerScale. Defaults to 0.0.
        """
        super().__init__()
        layers: list[nn.Module] = []
        in_channels = embed_dims
        for _ in range(num_fcs - 1):
            layers.append(
                nn.Sequential(
                    nn.Linear(in_channels, feedforward_channels),
                    build_activation_layer(activation, inplace),
                    nn.Dropout(dropout),
                )
            )
            in_channels = feedforward_channels
        layers.append(nn.Linear(feedforward_channels, embed_dims))
        layers.append(nn.Dropout(dropout))
        self.layers = nn.Sequential(*layers)
        self.dropout_layer = dropout_layer or nn.Identity()
        self.add_identity = add_identity
        self.layer_scale_init_value = layer_scale_init_value
        if self.layer_scale_init_value > 0:
            self.gamma2 = LayerScale(
                embed_dims, init_values=self.layer_scale_init_value
            )

    def forward(self, x: Tensor, identity: Tensor | None = None) -> Tensor:
        """Forward function for FFN.

        The function adds x to the output tensor if identity is None.
        """
        out = self.layers(x)
        if self.layer_scale_init_value > 0:
            out = self.gamma2(out)
        if self.add_identity:
            identity = x if identity is None else identity
            return identity + self.dropout_layer(out)
        return self.dropout_layer(out)
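
# Illustrative usage sketch for FFN (the feature sizes are assumptions):
#
# >>> ffn = FFN(embed_dims=256, feedforward_channels=1024)
# >>> feats = torch.rand(2, 100, 256)
# >>> ffn(feats).shape                               # identity connection kept
# torch.Size([2, 100, 256])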