Source code for vis4d.op.layer.attention

"""Attention layer."""

from __future__ import annotations

from torch import Tensor, nn

from vis4d.common.logging import rank_zero_warn
from vis4d.common.typing import ArgsType


class Attention(nn.Module):
    """ViT Attention Layer.

    Modified from timm (https://github.com/huggingface/pytorch-image-models).
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        """Init attention layer.

        Args:
            dim (int): Input tensor's dimension.
            num_heads (int, optional): Number of attention heads. Defaults
                to 8.
            qkv_bias (bool, optional): Whether to add bias to the qkv
                projection. Defaults to False.
            attn_drop (float, optional): Dropout rate for attention.
                Defaults to 0.0.
            proj_drop (float, optional): Dropout rate for projection.
                Defaults to 0.0.
        """
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def __call__(self, data: Tensor) -> Tensor:
        """Applies the layer.

        Args:
            data (Tensor): Input tensor of shape (B, N, dim).

        Returns:
            Tensor: Output tensor of the same shape as input.
        """
        return self._call_impl(data)

    def forward(self, x: Tensor) -> Tensor:
        """Forward pass."""
        batch_size, num_samples, dim = x.shape
        qkv = (
            self.qkv(x)
            .reshape(
                batch_size,
                num_samples,
                3,
                self.num_heads,
                dim // self.num_heads,
            )
            .permute(2, 0, 3, 1, 4)
        )
        # unbind to make torchscript happy (cannot use tensor as tuple)
        q, k, v = qkv.unbind(0)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(batch_size, num_samples, dim)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
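

# Usage sketch (illustrative, not part of the original module): ``Attention``
# consumes ViT-style token embeddings of shape (B, N, dim) and returns a
# tensor of the same shape. The concrete sizes below are assumptions chosen
# for demonstration only.
#
# >>> import torch
# >>> attn = Attention(dim=192, num_heads=8, qkv_bias=True)
# >>> tokens = torch.randn(2, 197, 192)  # 196 patch tokens + 1 class token
# >>> attn(tokens).shape
# torch.Size([2, 197, 192])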


class MultiheadAttention(nn.Module):
    """A wrapper for ``torch.nn.MultiheadAttention``.

    This module implements MultiheadAttention with identity connection, and
    the positional encoding is also passed as input.
    """

    def __init__(
        self,
        embed_dims: int,
        num_heads: int,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        dropout_layer: nn.Module | None = None,
        batch_first: bool = False,
        **kwargs: ArgsType,
    ) -> None:
        """Init MultiheadAttention.

        Args:
            embed_dims (int): The embedding dimension.
            num_heads (int): Parallel attention heads.
            attn_drop (float): Dropout rate applied to
                ``attn_output_weights``. Defaults to 0.0.
            proj_drop (float): Dropout rate applied after
                ``nn.MultiheadAttention``. Defaults to 0.0.
            dropout_layer (nn.Module | None, optional): The dropout layer
                used when adding the shortcut. Defaults to None, which uses
                ``nn.Identity``.
            batch_first (bool): If True, key, query and value have shape
                (batch, n, embed_dims), otherwise (n, batch, embed_dims).
                Defaults to False.
        """
        super().__init__()
        self.batch_first = batch_first
        self.embed_dims = embed_dims

        self.attn = nn.MultiheadAttention(
            embed_dims, num_heads, dropout=attn_drop, **kwargs
        )

        self.proj_drop = nn.Dropout(proj_drop)
        self.dropout_layer = dropout_layer or nn.Identity()

    def forward(
        self,
        query: Tensor,
        key: Tensor | None = None,
        value: Tensor | None = None,
        identity: Tensor | None = None,
        query_pos: Tensor | None = None,
        key_pos: Tensor | None = None,
        attn_mask: Tensor | None = None,
        key_padding_mask: Tensor | None = None,
    ) -> Tensor:
        """Forward function for ``MultiheadAttention``.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if self.batch_first is False,
                else [bs, num_queries, embed_dims].
            key (Tensor, optional): The key tensor with shape
                [num_keys, bs, embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims]. If None, ``query`` will be used.
                Defaults to None.
            value (Tensor, optional): The value tensor with the same shape as
                ``key``. Same as in ``nn.MultiheadAttention.forward``. If
                None, ``key`` will be used. Defaults to None.
            identity (Tensor, optional): The tensor used for the identity
                connection, with the same shape as ``query``. If None,
                ``query`` will be used. Defaults to None.
            query_pos (Tensor, optional): The positional encoding for
                ``query``, with the same shape as ``query``. If not None, it
                will be added to ``query`` before the attention. Defaults to
                None.
            key_pos (Tensor, optional): The positional encoding for ``key``,
                with the same shape as ``key``. If not None, it will be added
                to ``key`` before the attention. If None and ``query_pos``
                has the same shape as ``key``, ``query_pos`` will be used for
                ``key_pos``. Defaults to None.
            attn_mask (Tensor, optional): ByteTensor mask with shape
                [num_queries, num_keys]. Same as in
                ``nn.MultiheadAttention.forward``. Defaults to None.
            key_padding_mask (Tensor, optional): ByteTensor with shape
                [bs, num_keys]. Defaults to None.

        Returns:
            Tensor: Forwarded results with shape
                [num_queries, bs, embed_dims] if self.batch_first is False,
                else [bs, num_queries, embed_dims].
        """
        if key is None:
            key = query
        if value is None:
            value = key
        if identity is None:
            identity = query
        if key_pos is None and query_pos is not None:
            # use query_pos if key_pos is not available
            if query_pos.shape == key.shape:
                key_pos = query_pos
            else:
                rank_zero_warn(
                    "position encoding of key is "
                    f"missing in {self.__class__.__name__}."
                )
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos

        # ``torch.nn.MultiheadAttention`` expects the (query, key, value)
        # dataflow as (num_query, batch, embed_dims). If batch_first is True,
        # adjust the inputs from (batch, num_query, embed_dims) to
        # (num_query, batch, embed_dims) and recover ``attn_output`` back to
        # batch_first afterwards.
        if self.batch_first:
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)

        out = self.attn(
            query=query,
            key=key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
        )[0]

        if self.batch_first:
            out = out.transpose(0, 1)

        return identity + self.dropout_layer(self.proj_drop(out))
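

# Usage sketch (illustrative, not part of the original module): the wrapper
# adds the attention output to the identity tensor (the query by default),
# so the result keeps the query's shape. The sizes and the batch_first
# setting below are assumptions chosen for demonstration only.
#
# >>> import torch
# >>> mha = MultiheadAttention(embed_dims=256, num_heads=8, batch_first=True)
# >>> query = torch.randn(2, 100, 256)      # (bs, num_queries, embed_dims)
# >>> key = torch.randn(2, 50, 256)         # (bs, num_keys, embed_dims)
# >>> query_pos = torch.randn(2, 100, 256)
# >>> key_pos = torch.randn(2, 50, 256)
# >>> out = mha(query, key, query_pos=query_pos, key_pos=key_pos)
# >>> out.shape
# torch.Size([2, 100, 256])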