Source code for vis4d.op.detect.mask_rcnn

"""Mask RCNN detector."""

from __future__ import annotations

from typing import NamedTuple, Protocol

import torch
import torch.nn.functional as F
from torch import Tensor, nn
from torchvision.ops import roi_align

from vis4d.op.box.box2d import apply_mask
from vis4d.op.box.poolers import MultiScaleRoIAlign
from vis4d.op.mask.util import paste_masks_in_image, remove_overlap

from ..typing import Proposals, Targets



[docs]
class MaskRCNNHeadOut(NamedTuple):
    """Mask R-CNN RoI head outputs."""

    # logits for mask prediction. The dimension is number of masks x number of
    # classes x H_mask x W_mask
    mask_pred: list[torch.Tensor]




[docs]
class MaskRCNNHead(nn.Module):
    """Mask R-CNN RoI head.

    Args:
        num_classes (int, optional): Number of classes. Defaults to 80.
        num_convs (int, optional): Number of convolution layers. Defaults to 4.
        roi_size (tuple[int, int], optional): Size of RoI after pooling.
            Defaults to (14, 14).
        in_channels (int, optional): Input feature channels. Defaults to 256.
        conv_kernel_size (int, optional): Kernel size of convolution. Defaults
            to 3.
        conv_out_channels (int, optional): Output channels of convolution.
            Defaults to 256.
        scale_factor (int, optional): Scaling factor of upsampling. Defaults
            to 2.
        class_agnostic (bool, optional): Whether to do class agnostic mask
            prediction. Defaults to False.
    """

    def __init__(
        self,
        num_classes: int = 80,
        num_convs: int = 4,
        roi_size: tuple[int, int] = (14, 14),
        in_channels: int = 256,
        conv_kernel_size: int = 3,
        conv_out_channels: int = 256,
        scale_factor: int = 2,
        class_agnostic: bool = False,
    ) -> None:
        """Creates an instance of the class."""
        super().__init__()
        self.roi_pooler = MultiScaleRoIAlign(
            sampling_ratio=0, resolution=roi_size, strides=[4, 8, 16, 32]
        )

        self.convs = nn.ModuleList()
        for i in range(num_convs):
            in_channels = in_channels if i == 0 else conv_out_channels
            padding = (conv_kernel_size - 1) // 2
            self.convs.append(
                nn.Conv2d(
                    in_channels,
                    conv_out_channels,
                    conv_kernel_size,
                    padding=padding,
                )
            )

        upsample_in_channels = (
            conv_out_channels if num_convs > 0 else in_channels
        )
        self.upsample = nn.ConvTranspose2d(
            upsample_in_channels,
            conv_out_channels,
            scale_factor,
            stride=scale_factor,
        )

        out_channels = 1 if class_agnostic else num_classes
        self.conv_logits = nn.Conv2d(conv_out_channels, out_channels, 1)
        self.relu = nn.ReLU(inplace=True)

        self._init_weights(self.convs)
        self._init_weights(self.upsample, mode="fan_out")
        self._init_weights(self.conv_logits, mode="fan_out")

    @staticmethod
    def _init_weights(module: nn.Module, mode: str = "fan_in") -> None:
        """Initialize weights."""
        if hasattr(module, "weight") and hasattr(module, "bias"):
            assert isinstance(module.weight, torch.Tensor) and isinstance(
                module.bias, torch.Tensor
            )
            nn.init.kaiming_normal_(
                module.weight, mode=mode, nonlinearity="relu"
            )
            nn.init.constant_(module.bias, 0)


[docs]
    def forward(
        self, features: list[torch.Tensor], boxes: list[torch.Tensor]
    ) -> MaskRCNNHeadOut:
        """Forward pass.

        Args:
            features (list[torch.Tensor]): Feature pyramid.
            boxes (list[torch.Tensor]): Proposal boxes.

        Returns:
            MaskRCNNHeadOut: Mask prediction outputs.
        """
        # Take stride 4, 8, 16, 32 features
        mask_feats = self.roi_pooler(features[2:6], boxes)
        for conv in self.convs:
            mask_feats = self.relu(conv(mask_feats))
        mask_feats = self.relu(self.upsample(mask_feats))
        mask_pred = self.conv_logits(mask_feats)
        num_dets_per_img = tuple(len(d) for d in boxes)
        mask_preds = mask_pred.split(num_dets_per_img, 0)
        return MaskRCNNHeadOut(mask_pred=mask_preds)





[docs]
class MaskOut(NamedTuple):
    """Output of the final detections from Mask RCNN."""

    masks: list[torch.Tensor]  # N, H, W
    scores: list[torch.Tensor]
    class_ids: list[torch.Tensor]




[docs]
class Det2Mask(nn.Module):
    """Post processing of mask predictions.

    Args:
        mask_threshold (float, optional): Positive threshold. Defaults to 0.5.
        no_overlap (bool, optional): Whether to remove overlapping pixels
            between masks. Defaults to False.
    """

    def __init__(
        self, mask_threshold: float = 0.5, no_overlap: bool = False
    ) -> None:
        """Creates an instance of the class."""
        super().__init__()
        self.mask_threshold = mask_threshold
        self.no_overlap = no_overlap


[docs]
    def forward(
        self,
        mask_outs: list[torch.Tensor],
        det_boxes: list[torch.Tensor],
        det_scores: list[torch.Tensor],
        det_class_ids: list[torch.Tensor],
        original_hw: list[tuple[int, int]],
    ) -> MaskOut:
        """Paste mask predictions back into original image resolution.

        Args:
            mask_outs (list[torch.Tensor]): List of mask outputs for each batch
                element.
            det_boxes (list[torch.Tensor]): List of detection boxes for each
                batch element.
            det_scores (list[torch.Tensor]): List of detection scores for each
                batch element.
            det_class_ids (list[torch.Tensor]): List of detection classeds for
                each batch element.
            original_hw (list[tuple[int, int]]): Original image resolution.

        Returns:
            MaskOut: Post-processed mask predictions.
        """
        all_masks = []
        all_scores = []
        all_class_ids = []
        for mask_out, boxes, scores, class_ids, orig_hw in zip(
            mask_outs, det_boxes, det_scores, det_class_ids, original_hw
        ):
            pasted_masks = paste_masks_in_image(
                mask_out[torch.arange(len(mask_out)), class_ids],
                boxes,
                orig_hw[::-1],
                self.mask_threshold,
            )
            if self.no_overlap:
                pasted_masks = remove_overlap(pasted_masks, scores)
            all_masks.append(pasted_masks)
            all_scores.append(scores)
            all_class_ids.append(class_ids)
        return MaskOut(
            masks=all_masks, scores=all_scores, class_ids=all_class_ids
        )



[docs]
    def __call__(
        self,
        mask_outs: list[torch.Tensor],
        det_boxes: list[torch.Tensor],
        det_scores: list[torch.Tensor],
        det_class_ids: list[torch.Tensor],
        original_hw: list[tuple[int, int]],
    ) -> MaskOut:
        """Type definition for function call."""
        return self._call_impl(
            mask_outs, det_boxes, det_scores, det_class_ids, original_hw
        )





[docs]
class MaskRCNNHeadLosses(NamedTuple):
    """Mask RoI head loss container."""

    rcnn_loss_mask: torch.Tensor




[docs]
class MaskRCNNHeadLoss(nn.Module):
    """Mask RoI head loss function.

    Args:
        num_classes (int): number of object categories.
    """

    def __init__(self, num_classes: int) -> None:
        """Creates an instance of the class."""
        super().__init__()
        self.num_classes = num_classes

    @staticmethod
    def _get_targets_per_image(
        boxes: Tensor,
        tgt_masks: Tensor,
        out_shape: tuple[int, int],
        binarize: bool = True,
    ) -> Tensor:
        """Get aligned mask targets for each proposal.

        Args:
            boxes (Tensor): proposal boxes.
            tgt_masks (Tensor): target masks.
            out_shape (tuple[int, int]): output shape.
            binarize (bool, optional): whether to convert target mask to
                binary. Defaults to True.

        Returns:
            Tensor: aligned mask targets.
        """
        fake_inds = torch.arange(len(boxes), device=boxes.device)[:, None]
        rois = torch.cat([fake_inds, boxes], dim=1)  # Nx5
        gt_masks_th = tgt_masks[:, None, :, :].type(rois.dtype)
        targets = roi_align(
            gt_masks_th, rois, out_shape, 1.0, 0, True
        ).squeeze(1)
        resized_masks = targets >= 0.5 if binarize else targets
        return resized_masks


[docs]
    def forward(
        self,
        mask_preds: list[torch.Tensor],
        proposal_boxes: list[torch.Tensor],
        target_classes: list[torch.Tensor],
        target_masks: list[torch.Tensor],
    ) -> MaskRCNNHeadLosses:
        """Calculate losses of Mask RCNN head.

        Args:
            mask_preds (list[torch.Tensor]): [M, C, H', W'] mask outputs per
                batch element.
            proposal_boxes (list[torch.Tensor]): [M, 4] proposal boxes per
                batch element.
            target_classes (list[torch.Tensor]): list of [M, 4] assigned
                target boxes for each proposal.
            target_masks (list[torch.Tensor]): list of [M, H, W] assigned
                target masks for each proposal.

        Returns:
            MaskRCNNHeadLosses: mask loss.
        """
        mask_pred = torch.cat(mask_preds)
        mask_size = (mask_pred.shape[2], mask_pred.shape[3])
        # get targets
        targets = []
        for boxes, tgt_masks in zip(proposal_boxes, target_masks):
            if len(tgt_masks) == 0:
                targets.append(
                    torch.empty((0, *mask_size), device=tgt_masks.device)
                )
            else:
                targets.append(
                    self._get_targets_per_image(boxes, tgt_masks, mask_size)
                )
        mask_targets = torch.cat(targets)
        mask_labels = torch.cat(target_classes)

        if len(mask_targets) > 0:
            num_rois = mask_pred.shape[0]
            inds = torch.arange(
                0, num_rois, dtype=torch.long, device=mask_pred.device
            )
            pred_slice = mask_pred[inds, mask_labels[inds].long()].squeeze(1)
            loss_mask = F.binary_cross_entropy_with_logits(
                pred_slice, mask_targets.float(), reduction="mean"
            )
        else:
            loss_mask = mask_targets.sum()

        return MaskRCNNHeadLosses(rcnn_loss_mask=loss_mask)





[docs]
class MaskSampler(Protocol):
    """Type definition for mask sampler."""


[docs]
    def __call__(
        self,
        target_masks: list[Tensor],
        sampled_target_indices: list[Tensor],
        sampled_targets: Targets,
        sampled_proposals: Proposals,
    ) -> tuple[list[Tensor], list[Tensor], list[Tensor]]:
        """Type definition for function call.

        Args:
            target_masks (list[Tensor]): list of [N, H, W] target masks per
                batch element.
            sampled_target_indices (list[Tensor]): list of [M] indices of
                sampled targets per batch element.
            sampled_targets (Targets): sampled targets.
            sampled_proposals (Proposals): sampled proposals.

        Returns:
            tuple[list[Tensor], list[Tensor], list[Tensor]]: sampled masks,
                sampled target indices, sampled targets.
        """





[docs]
def positive_mask_sampler(
    target_masks: list[Tensor],
    sampled_target_indices: list[Tensor],
    sampled_targets: Targets,
    sampled_proposals: Proposals,
) -> tuple[list[Tensor], list[Tensor], list[Tensor]]:
    """Sample only positive masks from target masks.

    Args:
        target_masks (list[Tensor]): list of [N, H, W] target masks per
            batch element.
        sampled_target_indices (list[Tensor]): list of [M] indices of
            sampled targets per batch element.
        sampled_targets (Targets): sampled targets.
        sampled_proposals (Proposals): sampled proposals.

    Returns:
        tuple[list[Tensor], list[Tensor], list[Tensor]]: sampled masks,
            sampled target indices, sampled targets.
    """
    sampled_masks = apply_mask(sampled_target_indices, target_masks)[0]

    pos_proposals, pos_classes, pos_mask_targets = apply_mask(
        [torch.eq(label, 1) for label in sampled_targets.labels],
        sampled_proposals.boxes,
        sampled_targets.classes,
        sampled_masks,
    )
    return pos_proposals, pos_classes, pos_mask_targets




[docs]
class SampledMaskLoss(nn.Module):
    """Sampled Mask RCNN head loss function."""

    def __init__(
        self,
        mask_sampler: MaskSampler,
        loss: MaskRCNNHeadLoss,
    ) -> None:
        """Initialize sampled mask loss.

        Args:
            mask_sampler (MaskSampler): mask sampler.
            loss (MaskRCNNHeadLoss): mask loss.
        """
        super().__init__()
        self.loss = loss
        self.mask_sampler = mask_sampler


[docs]
    def forward(
        self,
        mask_preds: list[Tensor],
        target_masks: list[Tensor],
        sampled_target_indices: list[Tensor],
        sampled_targets: Targets,
        sampled_proposals: Proposals,
    ) -> MaskRCNNHeadLosses:
        """Calculate losses of Mask RCNN head.

        Args:
            mask_preds (list[torch.Tensor]): [M, C, H', W'] mask outputs per
                batch element.
            target_masks (list[torch.Tensor]): list of [M, H, W] assigned
                target masks for each proposal.
            sampled_target_indices (list[Tensor]): list of [M, 4] assigned
                target boxes for each proposal.
            sampled_targets (Targets): list of [M, 4] assigned
                target boxes for each proposal.
            sampled_proposals (Proposals): list of [M, 4] assigned
                target boxes for each proposal.

        Returns:
            MaskRCNNHeadLosses: mask loss.
        """
        pos_proposals, pos_classes, pos_mask_targets = self.mask_sampler(
            target_masks,
            sampled_target_indices,
            sampled_targets,
            sampled_proposals,
        )
        return self.loss(
            mask_preds, pos_proposals, pos_classes, pos_mask_targets
        )