"""Transformer layer.
Modified from timm (https://github.com/huggingface/pytorch-image-models) and
mmdetection (https://github.com/open-mmlab/mmdetection).
"""
from __future__ import annotations
import copy
import torch
from torch import Tensor, nn
from .attention import Attention
from .drop import DropPath
from .mlp import TransformerBlockMLP
from .util import build_activation_layer
def inverse_sigmoid(x: Tensor, eps: float = 1e-5) -> Tensor:
"""Inverse function of sigmoid.
Args:
x (Tensor): The tensor to do the inverse.
eps (float): EPS avoid numerical overflow. Defaults 1e-5.
Returns:
Tensor: The x has passed the inverse function of sigmoid, has same
shape with input.
"""
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
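
# Usage sketch (not part of the original module; the helper below is purely
# illustrative): within the clamped range, inverse_sigmoid approximately
# inverts torch.sigmoid.
def _example_inverse_sigmoid() -> None:
    """Round-trip a tensor through inverse_sigmoid and sigmoid."""
    x = torch.rand(2, 4)
    x_rec = torch.sigmoid(inverse_sigmoid(x))
    # Values inside (eps, 1 - eps) are recovered up to floating point error.
    assert torch.allclose(x, x_rec, atol=1e-4)
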
def get_clones(module: nn.Module, num: int) -> nn.ModuleList:
"""Create N identical layers."""
return nn.ModuleList([copy.deepcopy(module) for _ in range(num)])
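
# Usage sketch (illustrative only, not part of the original module): stack
# several independent copies of a layer, as is common for transformer
# encoder/decoder stacks.
def _example_get_clones() -> None:
    """Clone a linear layer; the copies do not share parameters."""
    layers = get_clones(nn.Linear(8, 8), num=3)
    assert len(layers) == 3
    # deepcopy gives each clone its own parameter storage.
    assert layers[0].weight.data_ptr() != layers[1].weight.data_ptr()
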
class LayerScale(nn.Module):
"""Layer scaler."""
def __init__(
self,
dim: int,
inplace: bool = False,
data_format: str = "channels_last",
init_values: float = 1e-5,
):
"""Init layer scaler.
Args:
dim (int): Input tensor's dimension.
inplace (bool): Whether performs operation in-place. Default:
False.
data_format (str): The input data format, could be 'channels_last'
or 'channels_first', representing (B, C, H, W) and (B, N, C)
format data respectively. Default: channels_last.
init_values (float, optional): Initial values for layer scale.
Defaults to 1e-5.
"""
super().__init__()
assert data_format in {
"channels_last",
"channels_first",
}, "data_format could only be channels_last or channels_first."
self.inplace = inplace
self.data_format = data_format
self.gamma = nn.Parameter(init_values * torch.ones(dim))
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Forward pass."""
if self.data_format == "channels_first":
shape = tuple((1, -1, *(1 for _ in range(x.dim() - 2))))
else:
shape = tuple((*(1 for _ in range(x.dim() - 1)), -1))
if self.inplace:
return x.mul_(self.gamma.view(*shape))
return x * self.gamma.view(*shape)
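
# Usage sketch (illustrative only, not part of the original module): scale
# token features of shape (B, N, C) with a learnable per-channel factor; the
# output shape matches the input.
def _example_layer_scale() -> None:
    """Apply LayerScale to a channels_last token sequence."""
    ls = LayerScale(dim=16, data_format="channels_last")
    tokens = torch.rand(2, 10, 16)  # (B, N, C)
    assert ls(tokens).shape == tokens.shape
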
class TransformerBlock(nn.Module):
"""Transformer block for Vision Transformer."""
def __init__(
self,
dim: int,
num_heads: int,
mlp_ratio: float = 4.0,
qkv_bias: bool = False,
drop: float = 0.0,
attn_drop: float = 0.0,
init_values: float | None = None,
drop_path: float = 0.0,
act_layer: nn.Module = nn.GELU(),
norm_layer: nn.Module | None = None,
):
"""Init transformer block.
Args:
dim (int): Input tensor's dimension.
num_heads (int): Number of attention heads.
mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding
dim. Defaults to 4.0.
qkv_bias (bool, optional): If to add bias to qkv. Defaults to
False.
drop (float, optional): Dropout rate for attention and projection.
Defaults to 0.0.
attn_drop (float, optional): Dropout rate for attention. Defaults
to 0.0.
init_values (tuple[float, float] | None, optional): Initial values
for layer scale. Defaults to None.
drop_path (float, optional): Dropout rate for drop path. Defaults
to 0.0.
act_layer (nn.Module, optional): Activation layer. Defaults to
nn.GELU.
norm_layer (nn.Module, optional): Normalization layer. If None, use
nn.LayerNorm.
"""
super().__init__()
self.norm1 = (
norm_layer(dim) if norm_layer else nn.LayerNorm(dim, eps=1e-6)
)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_drop=attn_drop,
proj_drop=drop,
)
self.ls1 = (
LayerScale(dim, init_values=init_values)
if init_values
else nn.Identity()
)
self.drop_path1 = (
DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
)
self.norm2 = (
norm_layer(dim) if norm_layer else nn.LayerNorm(dim, eps=1e-6)
)
self.mlp = TransformerBlockMLP(
in_features=dim,
hidden_features=int(dim * mlp_ratio),
act_layer=act_layer,
drop=drop,
)
self.ls2 = (
LayerScale(dim, init_values=init_values)
if init_values
else nn.Identity()
)
self.drop_path2 = (
DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
)
def __call__(self, data: torch.Tensor) -> torch.Tensor:
"""Forward pass.
Args:
data (torch.Tensor): Input tensor of shape (B, N, dim).
Returns:
torch.Tensor: Output tensor of shape (B, N, dim).
"""
return self._call_impl(data)
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Forward pass."""
x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
return x
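
# Usage sketch (illustrative only, not part of the original module): a single
# pre-norm ViT-style block applied to a token sequence of shape (B, N, dim);
# the residual connections preserve the shape.
def _example_transformer_block() -> None:
    """Run one TransformerBlock on random tokens."""
    block = TransformerBlock(dim=64, num_heads=4, qkv_bias=True)
    tokens = torch.rand(2, 197, 64)  # e.g. ViT with 196 patches + cls token
    assert block(tokens).shape == (2, 197, 64)
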
class FFN(nn.Module):
"""Implements feed-forward networks (FFNs) with identity connection."""
def __init__(
self,
embed_dims: int = 256,
feedforward_channels: int = 1024,
num_fcs: int = 2,
dropout: float = 0.0,
activation: str = "ReLU",
inplace: bool = True,
dropout_layer: nn.Module | None = None,
add_identity: bool = True,
layer_scale_init_value: float = 0.0,
) -> None:
"""Init FFN.
Args:
embed_dims (int): The feature dimension. Defaults: 256.
feedforward_channels (int): The hidden dimension of FFNs.
Defaults: 1024.
num_fcs (int): The number of fully-connected layers in FFNs.
Defaults: 2.
dropout (float): The dropout rate of FFNs.
activation (str): The activation function of FFNs.
inplace (bool): Whether to set inplace for activation.
dropout_layer (nn.Module | None, optional): The dropout_layer used
when adding the shortcut. Defaults to None. If None, Identity
is used.
add_identity (bool, optional): Whether to add the identity
connection. Default: True.
layer_scale_init_value (float): Initial value of scale factor in
LayerScale. Default: 0.0
"""
super().__init__()
layers: list[nn.Module] = []
in_channels = embed_dims
for _ in range(num_fcs - 1):
layers.append(
nn.Sequential(
nn.Linear(in_channels, feedforward_channels),
build_activation_layer(activation, inplace),
nn.Dropout(dropout),
)
)
in_channels = feedforward_channels
layers.append(nn.Linear(feedforward_channels, embed_dims))
layers.append(nn.Dropout(dropout))
self.layers = nn.Sequential(*layers)
self.dropout_layer = dropout_layer or nn.Identity()
self.add_identity = add_identity
self.layer_scale_init_value = layer_scale_init_value
if self.layer_scale_init_value > 0:
self.gamma2 = LayerScale(
embed_dims, init_values=self.layer_scale_init_value
)
    def forward(self, x: Tensor, identity: Tensor | None = None) -> Tensor:
        """Forward function for FFN.

        When add_identity is True, the identity connection defaults to x if
        identity is None.
        """
out = self.layers(x)
if self.layer_scale_init_value > 0:
out = self.gamma2(out)
if self.add_identity:
identity = x if identity is None else identity
return identity + self.dropout_layer(out)
return self.dropout_layer(out)
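
# Usage sketch (illustrative only, not part of the original module): the FFN
# keeps the embedding dimension and, with add_identity=True, returns the
# output added to the identity (x by default).
def _example_ffn() -> None:
    """Run the FFN on a token sequence and check the output shape."""
    ffn = FFN(embed_dims=256, feedforward_channels=1024, activation="ReLU")
    tokens = torch.rand(2, 100, 256)
    assert ffn(tokens).shape == (2, 100, 256)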