"""Positional encoding for transformer.
Modified from mmdetection (https://github.com/open-mmlab/mmdetection).
"""
import math
import torch
from torch import Tensor, nn
from .weight_init import uniform_init
[docs]
class SinePositionalEncoding(nn.Module):
"""Position encoding with sine and cosine functions.
See `End-to-End Object Detection with Transformers
<https://arxiv.org/pdf/2005.12872>`_ for details.
"""
def __init__(
self,
num_feats: int,
temperature: int = 10000,
normalize: bool = False,
scale: float = 2 * math.pi,
eps: float = 1e-6,
offset: float = 0.0,
) -> None:
"""Initialization for `SinePositionalEncoding`.
Args:
num_feats (int): The feature dimension for each position
along x-axis or y-axis. Note the final returned dimension
for each position is 2 times of this value.
temperature (int, optional): The temperature used for scaling
the position embedding. Defaults to 10000.
normalize (bool, optional): Whether to normalize the position
embedding. Defaults to False.
scale (float, optional): A scale factor that scales the position
embedding. The scale will be used only when normalize is True.
Defaults to 2*pi.
eps (float, optional): A value added to the denominator for
numerical stability. Defaults to 1e-6.
offset (float, optional): offset add to embed when do the
normalization. Defaults to 0.
"""
super().__init__()
if normalize:
assert isinstance(scale, (float, int)), (
"when normalize is set,"
"scale should be provided and in float or int type, "
f"found {type(scale)}"
)
self.num_feats = num_feats
self.temperature = temperature
self.normalize = normalize
self.scale = scale
self.eps = eps
self.offset = offset
[docs]
def forward(self, mask: Tensor) -> Tensor:
"""Forward function for `SinePositionalEncoding`.
Args:
mask (Tensor): ByteTensor mask. Non-zero values representing
ignored positions, while zero values means valid positions
for this image. Shape [bs, h, w].
Returns:
pos (Tensor): Returned position embedding with shape
[bs, num_feats*2, h, w].
"""
# For convenience of exporting to ONNX, it's required to convert
# `masks` from bool to int.
mask = mask.to(torch.int)
not_mask = 1 - mask # logical_not
y_embed = not_mask.cumsum(1, dtype=torch.float32)
x_embed = not_mask.cumsum(2, dtype=torch.float32)
if self.normalize:
y_embed = (
(y_embed + self.offset)
/ (y_embed[:, -1:, :] + self.eps)
* self.scale
)
x_embed = (
(x_embed + self.offset)
/ (x_embed[:, :, -1:] + self.eps)
* self.scale
)
dim_t = torch.arange(
self.num_feats, dtype=torch.float32, device=mask.device
)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats)
pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
# use `view` instead of `flatten` for dynamically exporting to ONNX
b, h, w = mask.size()
pos_x = torch.stack(
(pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
).view(b, h, w, -1)
pos_y = torch.stack(
(pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
).view(b, h, w, -1)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
[docs]
class LearnedPositionalEncoding(nn.Module):
"""Position embedding with learnable embedding weights."""
def __init__(
self, num_feats: int, row_num_embed: int = 50, col_num_embed: int = 50
) -> None:
"""Initialization for LearnedPositionalEncoding.
Args:
num_feats (int): The feature dimension for each position
along x-axis or y-axis. The final returned dimension for
each position is 2 times of this value.
row_num_embed (int, optional): The dictionary size of row
embeddings. Defaults to 50.
col_num_embed (int, optional): The dictionary size of col
embeddings. Defaults to 50.
"""
super().__init__()
self.row_embed = nn.Embedding(row_num_embed, num_feats)
self.col_embed = nn.Embedding(col_num_embed, num_feats)
self.num_feats = num_feats
self.row_num_embed = row_num_embed
self.col_num_embed = col_num_embed
self.init_weights()
[docs]
def init_weights(self) -> None:
"""Initialize the weights of position embedding."""
uniform_init(self.row_embed, lower=0, upper=1)
uniform_init(self.col_embed, lower=0, upper=1)
[docs]
def forward(self, mask: Tensor) -> Tensor:
"""Forward function for `LearnedPositionalEncoding`.
Args:
mask (Tensor): ByteTensor mask. Non-zero values representing
ignored positions, while zero values means valid positions
for this image. Shape [bs, h, w].
Returns:
pos (Tensor): Returned position embedding with shape
[bs, num_feats*2, h, w].
"""
h, w = mask.shape[-2:]
x = torch.arange(w, device=mask.device)
y = torch.arange(h, device=mask.device)
x_embed = self.col_embed(x)
y_embed = self.row_embed(y)
pos = (
torch.cat(
(
x_embed.unsqueeze(0).repeat(h, 1, 1),
y_embed.unsqueeze(1).repeat(1, w, 1),
),
dim=-1,
)
.permute(2, 0, 1)
.unsqueeze(0)
.repeat(mask.shape[0], 1, 1, 1)
)
return pos
[docs]
class SinePositionalEncoding3D(SinePositionalEncoding):
"""3D Position encoding with sine and cosine functions."""
[docs]
def forward(self, mask: Tensor) -> Tensor:
"""Forward function for `SinePositionalEncoding3D`.
Args:
mask (Tensor): ByteTensor mask. Non-zero values representing
ignored positions, while zero values means valid positions
for this image. Shape [bs, t, h, w].
Returns:
pos (Tensor): Returned position embedding with shape
[bs, num_feats*2, h, w].
"""
assert mask.dim() == 4, (
f"{mask.shape} should be a 4-dimensional Tensor,"
f" got {mask.dim()}-dimensional Tensor instead "
)
# For convenience of exporting to ONNX, it's required to convert
# `masks` from bool to int.
mask = mask.to(torch.int)
not_mask = 1 - mask # logical_not
z_embed = not_mask.cumsum(1, dtype=torch.float32)
y_embed = not_mask.cumsum(2, dtype=torch.float32)
x_embed = not_mask.cumsum(3, dtype=torch.float32)
if self.normalize:
z_embed = (
(z_embed + self.offset)
/ (z_embed[:, -1:, :, :] + self.eps)
* self.scale
)
y_embed = (
(y_embed + self.offset)
/ (y_embed[:, :, -1:, :] + self.eps)
* self.scale
)
x_embed = (
(x_embed + self.offset)
/ (x_embed[:, :, :, -1:] + self.eps)
* self.scale
)
dim_t = torch.arange(
self.num_feats, dtype=torch.float32, device=mask.device
)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats)
dim_t_z = torch.arange(
(self.num_feats * 2), dtype=torch.float32, device=mask.device
)
dim_t_z = self.temperature ** (
2 * (dim_t_z // 2) / (self.num_feats * 2)
)
pos_x = x_embed[:, :, :, :, None] / dim_t
pos_y = y_embed[:, :, :, :, None] / dim_t
pos_z = z_embed[:, :, :, :, None] / dim_t_z
# use `view` instead of `flatten` for dynamically exporting to ONNX
b, t, h, w = mask.size()
pos_x = torch.stack(
(pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()),
dim=5,
).view(b, t, h, w, -1)
pos_y = torch.stack(
(pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()),
dim=5,
).view(b, t, h, w, -1)
pos_z = torch.stack(
(pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()),
dim=5,
).view(b, t, h, w, -1)
pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3)
return pos