Source code for vis4d.op.box.poolers.roi_pooler

"""Vis4D RoI Pooling module."""

from __future__ import annotations

import abc
import math

import torch
from torchvision.ops import roi_align, roi_pool

from vis4d.common import ArgsType

from .base import RoIPooler
from .utils import assign_boxes_to_levels, boxes_to_tensor


# implementation modified from:
# https://github.com/facebookresearch/detectron2/
class MultiScaleRoIPooler(RoIPooler):
    """Wrapper for RoI pooling that supports multi-scale feature maps."""

    def __init__(
        self,
        resolution: tuple[int, int],
        strides: list[int],
        canonical_box_size: int = 224,
        canonical_level: int = 4,
        aligned: bool = True,
    ):
        """Multi-scale version of arbitrary RoI pooling operations.

        Args:
            resolution: Pooler resolution.
            strides: Feature map strides relative to the input, ordered from
                finest to coarsest. The strides must be powers of 2, with
                each stride twice the previous one, so that the scales
                (1 / stride) form a monotonically decreasing geometric
                sequence with a factor of 1/2.
            canonical_box_size: Canonical box size in pixels (sqrt(box
                area)). The default is heuristically defined as 224 pixels
                in the FPN paper (based on ImageNet pre-training).
            canonical_level: The feature map level index on which a
                canonically sized box should be placed. The default is
                defined as level 4 (stride=16) in the FPN paper, i.e., a box
                of size 224x224 will be placed on the feature map with
                stride=16. The placement of all boxes is determined by their
                sizes w.r.t. canonical_box_size. For example, a box whose
                area is 4x that of a canonical box should be used to pool
                features from feature level ``canonical_level + 1`` (an
                illustrative sketch of this heuristic follows the class
                definition).
            aligned: For the roi_align op. Shift the box coordinates by
                -0.5 for a better alignment with the two neighboring pixel
                indices.
        """
        super().__init__(resolution)
        self.canonical_level = canonical_level
        self.canonical_box_size = canonical_box_size
        self.aligned = aligned
        self.strides = strides

        # Map scale (defined as 1 / stride) to its feature map level under
        # the assumption that stride is a power of 2.
        self.scales = [1 / s for s in self.strides]
        min_level = -(math.log2(self.scales[0]))
        max_level = -(math.log2(self.scales[-1]))
        assert math.isclose(min_level, int(min_level)) and math.isclose(
            max_level, int(max_level)
        ), "Featuremap stride is not a power of 2!"
        self.min_level = int(min_level)
        self.max_level = int(max_level)
        assert (
            len(self.scales) == self.max_level - self.min_level + 1
        ), "[ROIPooler] Sizes of input feature maps do not form a pyramid!"
        assert 0 <= self.min_level <= self.max_level
        assert self.canonical_box_size > 0

    def forward(
        self, features: list[torch.Tensor], boxes: list[torch.Tensor]
    ) -> torch.Tensor:
        """Torchvision-based RoI pooling operation.

        Args:
            features: List of image feature tensors (e.g., FPN levels) in
                NCHW format.
            boxes: List of box proposals, one tensor per image.

        Returns:
            torch.Tensor: NCHW format, where N = num boxes (total), HW is
                the RoI size, and C is the feature dim. Boxes are
                concatenated along dimension 0 for all batch elements.
        """
        assert len(features) == len(self.scales), (
            f"unequal value, len(strides)={len(self.scales)}, "
            f"but x is list of {len(features)} Tensors"
        )
        assert len(boxes) == features[0].shape[0], (
            f"unequal value, x[0] batch dim 0 is {features[0].shape[0]}, "
            f"but box_list has length {len(boxes)}"
        )
        if len(boxes) == 0:
            return torch.zeros(
                (0, features[0].shape[1]) + self.resolution,
                device=features[0].device,
                dtype=features[0].dtype,
            )

        # Convert the per-image box lists to the (K, 5) tensor format that
        # torchvision's RoI ops expect: batch index in column 0, followed
        # by XYXY coordinates.
        pooler_fmt_boxes = boxes_to_tensor(boxes)

        if len(self.scales) == 1:
            return self._pooling_op(
                features[0], pooler_fmt_boxes, spatial_scale=self.scales[0]
            )

        level_assignments = assign_boxes_to_levels(
            boxes,
            self.min_level,
            self.max_level,
            self.canonical_box_size,
            self.canonical_level,
        )

        num_boxes = pooler_fmt_boxes.shape[0]
        num_channels = features[0].shape[1]
        output_size = self.resolution[0]

        dtype, device = features[0].dtype, features[0].device
        output = torch.zeros(
            (num_boxes, num_channels, output_size, output_size),
            dtype=dtype,
            device=device,
        )

        for level, scale in enumerate(self.scales):
            inds = torch.eq(level_assignments, level).nonzero()[:, 0]
            pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
            pooled_features = self._pooling_op(
                features[level], pooler_fmt_boxes_level, spatial_scale=scale
            )
            # Use index_put_ instead of advanced indexing to avoid
            # pytorch/issues/49852.
            output.index_put_((inds,), pooled_features)
        return output

    @abc.abstractmethod
    def _pooling_op(
        self,
        inputs: torch.Tensor,
        boxes: torch.Tensor,
        spatial_scale: float = 1.0,
    ) -> torch.Tensor:
        """Execute the pooling op defined in the config."""
        raise NotImplementedError
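

# NOTE: Illustrative sketch, not part of the library. The actual level
# assignment is implemented by `assign_boxes_to_levels` in `.utils`; the
# standalone version below only spells out the FPN-paper formula it is based
# on and assumes detectron2-style clamping and eps handling.
def _assign_level_sketch(
    box_area: float,
    min_level: int,
    max_level: int,
    canonical_box_size: int = 224,
    canonical_level: int = 4,
) -> int:
    """Map a box area to a 0-based pyramid-level index (FPN heuristic)."""
    # level = floor(k0 + log2(sqrt(area) / canonical_box_size)); the small
    # eps guards against log2(0).
    level = math.floor(
        canonical_level
        + math.log2(math.sqrt(box_area) / canonical_box_size + 1e-8)
    )
    # Clamp to the available levels and shift to a 0-based index into the
    # feature list, matching how forward() consumes the assignments. E.g.,
    # a 448x448 box (4x the canonical area) lands on canonical_level + 1.
    return min(max(level, min_level), max_level) - min_level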


class MultiScaleRoIAlign(MultiScaleRoIPooler):
    """RoI Align supporting multi-scale inputs."""

    def __init__(
        self, sampling_ratio: int, *args: ArgsType, **kwargs: ArgsType
    ) -> None:
        """Creates an instance of the class.

        Args:
            sampling_ratio: Number of sampling points per output bin used
                by torchvision's roi_align; a value <= 0 selects an
                adaptive number of points.
            *args: Arguments forwarded to MultiScaleRoIPooler.
            **kwargs: Keyword arguments forwarded to MultiScaleRoIPooler.
        """
        super().__init__(*args, **kwargs)
        self.sampling_ratio = sampling_ratio

    def _pooling_op(
        self,
        inputs: torch.Tensor,
        boxes: torch.Tensor,
        spatial_scale: float = 1.0,
    ) -> torch.Tensor:
        """RoIAlign wrapper."""
        return roi_align(
            inputs,
            boxes,
            self.resolution,
            spatial_scale,
            self.sampling_ratio,
            self.aligned,
        )


class MultiScaleRoIPool(MultiScaleRoIPooler):
    """RoI Pool supporting multi-scale inputs."""

    def _pooling_op(
        self,
        inputs: torch.Tensor,
        boxes: torch.Tensor,
        spatial_scale: float = 1.0,
    ) -> torch.Tensor:
        """RoIPool wrapper."""
        return roi_pool(inputs, boxes, self.resolution, spatial_scale)
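

# Usage sketch (illustrative only; shapes and boxes are made up): pooling
# 7x7 features for a batch of two images from a four-level, FPN-style
# pyramid with strides 4/8/16/32. MultiScaleRoIPool is used the same way,
# minus `sampling_ratio`. Calling the pooler directly assumes the RoIPooler
# base class is a torch.nn.Module, as its forward() signature suggests.
if __name__ == "__main__":
    pooler = MultiScaleRoIAlign(
        sampling_ratio=0, resolution=(7, 7), strides=[4, 8, 16, 32]
    )
    # Feature pyramid for two 256x256 images, 256 channels per level.
    features = [
        torch.randn(2, 256, 256 // s, 256 // s) for s in (4, 8, 16, 32)
    ]
    # One (N_i, 4) XYXY box tensor per image: 1 + 2 = 3 boxes in total.
    boxes = [
        torch.tensor([[10.0, 10.0, 100.0, 100.0]]),
        torch.tensor([[0.0, 0.0, 50.0, 60.0], [30.0, 30.0, 200.0, 220.0]]),
    ]
    out = pooler(features, boxes)
    print(out.shape)  # torch.Size([3, 256, 7, 7])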