"""QD-3DT detector."""

from __future__ import annotations

from typing import NamedTuple

import numpy as np
import torch
from torch import Tensor, nn

from vis4d.common.typing import LossesType
from vis4d.op.box.encoder.qd_3dt import QD3DTBox3DDecoder, QD3DTBox3DEncoder
from vis4d.op.box.matchers import Matcher, MaxIoUMatcher
from vis4d.op.box.poolers import MultiScaleRoIAlign, RoIPooler
from vis4d.op.box.samplers import (
    CombinedSampler,
    Sampler,
    match_and_sample_proposals,
)
from vis4d.op.geometry.rotation import generate_rotation_output
from vis4d.op.layer import Conv2d, add_conv_branch
from vis4d.op.layer.weight_init import kaiming_init, xavier_init
from vis4d.op.loss.base import Loss
from vis4d.op.loss.common import rotation_loss, smooth_l1_loss
from vis4d.op.loss.reducer import LossReducer, SumWeightedLoss, mean_loss


class QD3DTBBox3DHeadOutput(NamedTuple):
    """QD-3DT bounding box 3D head training output.

    Attributes:
        predictions (list[Tensor]): Per-image 3D box regression parameters.
        targets (Tensor | None): Encoded 3D box targets (training only).
        labels (Tensor | None): Class labels of the sampled targets
            (training only).
    """

    predictions: list[Tensor]
    targets: Tensor | None
    labels: Tensor | None

class QD3DTDet3DOut(NamedTuple):
    """Output of QD-3DT bounding box 3D head.

    Attributes:
        boxes_3d (list[Tensor]): Predicted 3D bounding boxes. Each tensor has
            shape (N, 12) and contains x,y,z,h,w,l,rx,ry,rz,vx,vy,vz.
        depth_uncertainty (list[Tensor]): Predicted depth uncertainty. Each
            tensor has shape (N,).
    """

    boxes_3d: list[Tensor]
    depth_uncertainty: list[Tensor]

def get_default_proposal_pooler() -> RoIPooler:
    """Get default proposal pooler of QD-3DT bounding box 3D head."""
    return MultiScaleRoIAlign(
        resolution=[7, 7], strides=[4, 8, 16, 32], sampling_ratio=0
    )
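
# Usage sketch (illustrative, not part of the original module): the default
# pooler maps boxes on a 4-level feature pyramid with strides 4/8/16/32 to
# fixed 7x7 RoI features, mirroring how `get_predictions` calls it below.
# The feature shapes here are hypothetical placeholders.
#
#     pooler = get_default_proposal_pooler()
#     feats = [torch.rand(1, 256, 256 // s, 256 // s) for s in (4, 8, 16, 32)]
#     boxes = [torch.tensor([[16.0, 16.0, 96.0, 96.0]])]
#     roi_feats = pooler(feats, boxes)  # expected shape: (1, 256, 7, 7)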

def get_default_box_sampler() -> CombinedSampler:
    """Get default box sampler of QD-3DT bounding box 3D head."""
    return CombinedSampler(
        batch_size=512,
        positive_fraction=0.25,
        pos_strategy="instance_balanced",
        neg_strategy="iou_balanced",
    )

def get_default_box_matcher() -> MaxIoUMatcher:
    """Get default box matcher of QD-3DT bounding box 3D head."""
    return MaxIoUMatcher(
        thresholds=[0.5, 0.5],
        labels=[0, -1, 1],
        allow_low_quality_matches=False,
    )
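
# Note (interpretation, following common max-IoU matcher semantics rather than
# anything stated in this file): with thresholds=[0.5, 0.5] and
# labels=[0, -1, 1], proposals with IoU < 0.5 to all ground truth boxes are
# labeled negative (0) and proposals with IoU >= 0.5 positive (1); the ignore
# band (-1) between the two thresholds is empty because they coincide.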

def get_default_box_codec(
    center_scale: float = 10.0,
    depth_log_scale: float = 2.0,
    dim_log_scale: float = 2.0,
    num_rotation_bins: int = 2,
    bin_overlap: float = 1 / 6,
) -> tuple[QD3DTBox3DEncoder, QD3DTBox3DDecoder]:
    """Get the default bounding box encoder and decoder."""
    return (
        QD3DTBox3DEncoder(
            center_scale=center_scale,
            depth_log_scale=depth_log_scale,
            dim_log_scale=dim_log_scale,
            num_rotation_bins=num_rotation_bins,
            bin_overlap=bin_overlap,
        ),
        QD3DTBox3DDecoder(
            center_scale=center_scale,
            depth_log_scale=depth_log_scale,
            dim_log_scale=dim_log_scale,
            num_rotation_bins=num_rotation_bins,
        ),
    )
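
# Usage sketch (illustrative, not part of the original module): the encoder
# turns (2D box, 3D box, intrinsics) triplets into regression targets and the
# decoder inverts the mapping at test time, matching the calls made in
# `QD3DTBBox3DHead.get_targets` and `RoI2Det3D.__call__` below. The tensor
# contents are placeholders.
#
#     encoder, decoder = get_default_box_codec()
#     targets = encoder(boxes_2d, boxes_3d, intrinsics)
#     boxes_3d_dec = decoder(boxes_2d, box_deltas, intrinsics)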

class QD3DTBBox3DHead(nn.Module):
    """This class implements the QD-3DT bounding box 3D head."""

    def __init__(  # pylint: disable=too-many-arguments
        self,
        num_classes: int,
        proposal_pooler: None | RoIPooler = None,
        box_matcher: None | Matcher = None,
        box_sampler: None | Sampler = None,
        box_encoder: None | QD3DTBox3DEncoder = None,
        proposal_append_gt: bool = True,
        num_shared_convs: int = 2,
        num_shared_fcs: int = 0,
        num_dep_convs: int = 4,
        num_dep_fcs: int = 0,
        num_dim_convs: int = 4,
        num_dim_fcs: int = 0,
        num_rot_convs: int = 4,
        num_rot_fcs: int = 0,
        num_cen_2d_convs: int = 4,
        num_cen_2d_fcs: int = 0,
        in_channels: int = 256,
        conv_out_dim: int = 256,
        fc_out_dim: int = 1024,
        roi_feat_size: int = 7,
        conv_has_bias: bool = True,
        norm: None | str = None,
        num_groups: int = 32,
        num_rotation_bins: int = 2,
        start_level: int = 2,
    ):
        """Initialize the QD-3DT bounding box 3D head."""
        super().__init__()
        self.proposal_pooler = (
            proposal_pooler
            if proposal_pooler is not None
            else get_default_proposal_pooler()
        )
        self.box_matcher = (
            box_matcher
            if box_matcher is not None
            else get_default_box_matcher()
        )
        self.box_sampler = (
            box_sampler
            if box_sampler is not None
            else get_default_box_sampler()
        )
        self.box_encoder = (
            box_encoder if box_encoder is not None else QD3DTBox3DEncoder()
        )

        self.num_shared_convs = num_shared_convs
        self.num_shared_fcs = num_shared_fcs
        self.num_rotation_bins = num_rotation_bins
        self.proposal_append_gt = proposal_append_gt
        self.cls_out_channels = num_classes

        # Used feature layers are [start_level, end_level)
        self.start_level = start_level
        num_strides = len(self.proposal_pooler.scales)
        self.end_level = start_level + num_strides

        # add shared convs and fcs
        (
            self.shared_convs,
            self.shared_fcs,
            self.shared_out_channels,
        ) = self._add_conv_fc_branch(
            num_shared_convs,
            num_shared_fcs,
            in_channels,
            conv_out_dim,
            fc_out_dim,
            conv_has_bias,
            norm,
            num_groups,
            True,
        )

        # add depth specific branch
        (
            self.dep_convs,
            self.dep_fcs,
            self.dep_last_dim,
        ) = self._add_conv_fc_branch(
            num_dep_convs,
            num_dep_fcs,
            self.shared_out_channels,
            conv_out_dim,
            fc_out_dim,
            conv_has_bias,
            norm,
            num_groups,
        )

        # add dim specific branch
        (
            self.dim_convs,
            self.dim_fcs,
            self.dim_last_dim,
        ) = self._add_conv_fc_branch(
            num_dim_convs,
            num_dim_fcs,
            self.shared_out_channels,
            conv_out_dim,
            fc_out_dim,
            conv_has_bias,
            norm,
            num_groups,
        )

        # add rot specific branch
        (
            self.rot_convs,
            self.rot_fcs,
            self.rot_last_dim,
        ) = self._add_conv_fc_branch(
            num_rot_convs,
            num_rot_fcs,
            self.shared_out_channels,
            conv_out_dim,
            fc_out_dim,
            conv_has_bias,
            norm,
            num_groups,
        )

        # add delta 2D center specific branch
        (
            self.cen_2d_convs,
            self.cen_2d_fcs,
            self.cen_2d_last_dim,
        ) = self._add_conv_fc_branch(
            num_cen_2d_convs,
            num_cen_2d_fcs,
            self.shared_out_channels,
            conv_out_dim,
            fc_out_dim,
            conv_has_bias,
            norm,
            num_groups,
        )

        if num_shared_fcs == 0:
            if num_dep_fcs == 0:
                self.dep_last_dim *= roi_feat_size * roi_feat_size
            if num_dim_fcs == 0:
                self.dim_last_dim *= roi_feat_size * roi_feat_size
            if num_rot_fcs == 0:
                self.rot_last_dim *= roi_feat_size * roi_feat_size
            if num_cen_2d_fcs == 0:
                self.cen_2d_last_dim *= roi_feat_size * roi_feat_size

        self.relu = nn.ReLU(inplace=True)

        # build per-task output layers; their input sizes depend on the
        # branch configurations above
        out_dim_dep = self.cls_out_channels
        self.fc_dep = nn.Linear(self.dep_last_dim, out_dim_dep)

        self.fc_dep_uncer = nn.Linear(self.dep_last_dim, out_dim_dep)

        out_dim_size = 3 * self.cls_out_channels
        self.fc_dim = nn.Linear(self.dim_last_dim, out_dim_size)

        out_rot_size = 3 * num_rotation_bins * self.cls_out_channels
        self.fc_rot = nn.Linear(self.rot_last_dim, out_rot_size)

        out_cen_2d_size = 2 * self.cls_out_channels
        self.fc_cen_2d = nn.Linear(self.cen_2d_last_dim, out_cen_2d_size)

        self._init_weights()

    def _init_weights(self) -> None:
        """Init weights of modules in head."""
        module_lists: list[nn.ModuleList | nn.Linear | Conv2d] = []
        module_lists += [self.shared_convs]
        module_lists += [self.shared_fcs]
        module_lists += [self.dep_convs]
        module_lists += [self.fc_dep_uncer]
        module_lists += [self.fc_dep, self.dep_fcs]
        module_lists += [self.dim_convs]
        module_lists += [self.fc_dim, self.dim_fcs]
        module_lists += [self.rot_convs]
        module_lists += [self.fc_rot, self.rot_fcs]
        module_lists += [self.cen_2d_convs]
        module_lists += [self.fc_cen_2d, self.cen_2d_fcs]

        for module_list in module_lists:
            for m in module_list.modules():
                if isinstance(m, nn.Linear):
                    xavier_init(m, distribution="uniform")
                elif isinstance(m, Conv2d):
                    kaiming_init(m)

    def _add_conv_fc_branch(
        self,
        num_branch_convs: int,
        num_branch_fcs: int,
        in_channels: int,
        conv_out_dim: int,
        fc_out_dim: int,
        conv_has_bias: bool,
        norm: None | str,
        num_groups: int,
        is_shared: bool = False,
    ) -> tuple[nn.ModuleList, nn.ModuleList, int]:
        """Init modules of head."""
        convs, last_layer_dim = add_conv_branch(
            num_branch_convs,
            in_channels,
            conv_out_dim,
            conv_has_bias,
            norm,
            num_groups,
        )

        fcs = nn.ModuleList()
        if num_branch_fcs > 0:
            if is_shared or num_branch_fcs == 0:
                last_layer_dim *= int(
                    np.prod(self.proposal_pooler.resolution)
                )
            for i in range(num_branch_fcs):
                fc_in_dim = last_layer_dim if i == 0 else fc_out_dim
                fcs.append(
                    nn.Sequential(
                        nn.Linear(fc_in_dim, fc_out_dim),
                        nn.ReLU(inplace=True),
                    )
                )
            last_layer_dim = fc_out_dim
        return convs, fcs, last_layer_dim

    def get_embeds(
        self, feat: Tensor
    ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
        """Generate embedding from bbox feature."""
        # shared part
        if self.num_shared_convs > 0:
            for conv in self.shared_convs:
                feat = conv(feat)

        if self.num_shared_fcs > 0:
            feat = feat.view(feat.size(0), -1)
            for fc in self.shared_fcs:
                feat = self.relu(fc(feat))

        # separate branches
        x_dep = feat
        x_dim = feat
        x_rot = feat
        x_cen_2d = feat

        for conv in self.dep_convs:
            x_dep = conv(x_dep)
        if x_dep.dim() > 2:
            x_dep = x_dep.view(x_dep.size(0), -1)
        for fc in self.dep_fcs:
            x_dep = self.relu(fc(x_dep))

        for conv in self.dim_convs:
            x_dim = conv(x_dim)
        if x_dim.dim() > 2:
            x_dim = x_dim.view(x_dim.size(0), -1)
        for fc in self.dim_fcs:
            x_dim = self.relu(fc(x_dim))

        for conv in self.rot_convs:
            x_rot = conv(x_rot)
        if x_rot.dim() > 2:
            x_rot = x_rot.view(x_rot.size(0), -1)
        for fc in self.rot_fcs:
            x_rot = self.relu(fc(x_rot))

        for conv in self.cen_2d_convs:
            x_cen_2d = conv(x_cen_2d)
        if x_cen_2d.dim() > 2:
            x_cen_2d = x_cen_2d.view(x_cen_2d.size(0), -1)
        for fc in self.cen_2d_fcs:
            x_cen_2d = self.relu(fc(x_cen_2d))

        return x_dep, x_dim, x_rot, x_cen_2d

    def get_outputs(
        self, x_dep: Tensor, x_dim: Tensor, x_rot: Tensor, x_cen_2d: Tensor
    ) -> Tensor:
        """Generate output 3D bounding box parameters."""
        depth = self.fc_dep(x_dep).view(-1, self.cls_out_channels, 1)
        depth_uncertainty = self.fc_dep_uncer(x_dep).view(
            -1, self.cls_out_channels, 1
        )
        dim = self.fc_dim(x_dim).view(-1, self.cls_out_channels, 3)
        alpha = generate_rotation_output(
            self.fc_rot(x_rot), self.num_rotation_bins
        )
        delta_cen_2d = self.fc_cen_2d(x_cen_2d).view(
            -1, self.cls_out_channels, 2
        )
        return torch.cat(
            [delta_cen_2d, depth, dim, alpha, depth_uncertainty], -1
        )
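
    # Layout of the concatenated output along the last axis (per class), as
    # assembled above; with the default num_rotation_bins = 2 this yields
    # 2 + 1 + 3 + 6 + 1 = 13 channels:
    #
    #     out[..., 0:2]   delta 2D center
    #     out[..., 2]     depth
    #     out[..., 3:6]   dimensions
    #     out[..., 6:12]  rotation (3 * num_rotation_bins)
    #     out[..., 12]    depth uncertainty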

    def get_predictions(
        self, features: list[Tensor], boxes_2d: list[Tensor]
    ) -> list[Tensor]:
        """Get 3D bounding box prediction parameters."""
        if sum(len(b) for b in boxes_2d) == 0:  # pragma: no cover
            return [
                torch.empty(
                    (
                        0,
                        self.cls_out_channels,
                        6 + 3 * self.num_rotation_bins + 1,
                    ),
                    device=boxes_2d[0].device,
                )
            ] * len(boxes_2d)

        roi_feats = self.proposal_pooler(
            features[self.start_level : self.end_level], boxes_2d
        )
        x_dep, x_dim, x_rot, x_cen_2d = self.get_embeds(roi_feats)

        outputs: list[Tensor] = list(
            self.get_outputs(x_dep, x_dim, x_rot, x_cen_2d).split(
                [len(b) for b in boxes_2d]
            )
        )
        return outputs

    def get_targets(
        self,
        pos_assigned_gt_inds: list[Tensor],
        target_boxes: list[Tensor],
        target_boxes3d: list[Tensor],
        target_class_ids: list[Tensor],
        intrinsics: Tensor,
    ) -> tuple[Tensor, Tensor]:
        """Get 3D bounding box targets for training."""
        targets = []
        labels = []
        for i, (tgt_boxes, tgt_boxes3d, intrinsics_) in enumerate(
            zip(target_boxes, target_boxes3d, intrinsics)
        ):
            bbox_target = self.box_encoder(
                tgt_boxes, tgt_boxes3d, intrinsics_
            )
            targets.append(bbox_target[pos_assigned_gt_inds[i]])
            labels.append(target_class_ids[i][pos_assigned_gt_inds[i]])
        return torch.cat(targets), torch.cat(labels)

    def forward(
        self,
        features: list[Tensor],
        det_boxes: list[Tensor],
        intrinsics: Tensor | None = None,
        target_boxes: list[Tensor] | None = None,
        target_boxes3d: list[Tensor] | None = None,
        target_class_ids: list[Tensor] | None = None,
    ) -> QD3DTBBox3DHeadOutput:
        """Forward pass.

        If intrinsics and all targets are given, proposals are matched and
        sampled against the targets and encoded 3D box targets are returned
        alongside the predictions; otherwise only predictions are returned.
        """
        if (
            intrinsics is not None
            and target_boxes is not None
            and target_boxes3d is not None
            and target_class_ids is not None
        ):
            if self.proposal_append_gt:
                det_boxes = [
                    torch.cat([d, t])
                    for d, t in zip(det_boxes, target_boxes)
                ]

            (
                sampled_box_indices,
                sampled_target_indices,
                sampled_labels,
            ) = match_and_sample_proposals(
                self.box_matcher, self.box_sampler, det_boxes, target_boxes
            )

            positives = [torch.eq(l, 1) for l in sampled_labels]
            pos_assigned_gt_inds = [
                i[p] if len(p) != 0 else p
                for i, p in zip(sampled_target_indices, positives)
            ]
            pos_boxes = [
                b[s_i][p]
                for b, s_i, p in zip(
                    det_boxes, sampled_box_indices, positives
                )
            ]

            predictions = self.get_predictions(features, pos_boxes)

            targets, labels = self.get_targets(
                pos_assigned_gt_inds,
                target_boxes,
                target_boxes3d,
                target_class_ids,
                intrinsics,
            )

            return QD3DTBBox3DHeadOutput(
                predictions=predictions, targets=targets, labels=labels
            )

        predictions = self.get_predictions(features, det_boxes)
        return QD3DTBBox3DHeadOutput(predictions, None, None)

    def __call__(
        self,
        features: list[Tensor],
        det_boxes: list[Tensor],
        intrinsics: Tensor | None = None,
        target_boxes: list[Tensor] | None = None,
        target_boxes3d: list[Tensor] | None = None,
        target_class_ids: list[Tensor] | None = None,
    ) -> QD3DTBBox3DHeadOutput:
        """Type definition."""
        return self._call_impl(
            features,
            det_boxes,
            intrinsics,
            target_boxes,
            target_boxes3d,
            target_class_ids,
        )
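
# Usage sketch (illustrative, not part of the original module): at inference
# only features and 2D detections are passed, so `targets` and `labels` in
# the output are None; during training the ground truth 2D boxes, 3D boxes,
# class ids, and camera intrinsics additionally enable target generation.
# The variable names below are hypothetical.
#
#     head = QD3DTBBox3DHead(num_classes=8)
#     # features: one tensor per pyramid level; levels [2, 6) are pooled
#     # with the default start_level=2 and the 4-stride default pooler.
#     out = head(features, det_boxes)              # inference
#     out = head(features, det_boxes, intrinsics,  # training
#                target_boxes, target_boxes3d, target_class_ids)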

class RoI2Det3D:
    """Post processing for QD3DTBBox3DHead."""

    def __init__(self, box_decoder: None | QD3DTBox3DDecoder = None) -> None:
        """Initialize."""
        self.box_decoder = (
            QD3DTBox3DDecoder() if box_decoder is None else box_decoder
        )

    def __call__(
        self,
        predictions: list[Tensor],
        boxes_2d: list[Tensor],
        class_ids: list[Tensor],
        intrinsics: Tensor,
    ) -> QD3DTDet3DOut:
        """Forward pass during testing stage.

        Args:
            predictions (list[Tensor]): Predictions.
            boxes_2d (list[Tensor]): 2D boxes.
            class_ids (list[Tensor]): Class IDs.
            intrinsics (Tensor): Camera intrinsics.

        Returns:
            QD3DTDet3DOut: QD3DT 3D detection output.
        """
        boxes_3d = []
        depth_uncertainty = []
        device = boxes_2d[0].device
        for _boxes_2d, _class_ids, _boxes_deltas, _intrinsics in zip(
            boxes_2d, class_ids, predictions, intrinsics
        ):
            if len(_boxes_2d) == 0:
                boxes_3d.append(torch.empty(0, 12).to(device))
                depth_uncertainty.append(torch.empty(0).to(device))
                continue

            _boxes_deltas = _boxes_deltas[
                torch.arange(_boxes_deltas.shape[0]), _class_ids
            ]

            depth_uncertainty.append(
                _boxes_deltas[:, -1].clamp(min=0.0, max=1.0)
            )

            boxes_3d.append(
                self.box_decoder(_boxes_2d, _boxes_deltas, _intrinsics)
            )

        return QD3DTDet3DOut(
            boxes_3d=boxes_3d, depth_uncertainty=depth_uncertainty
        )
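
# Usage sketch (illustrative, not part of the original module): converting
# head predictions for a batch of images into decoded 3D boxes. `class_ids`
# selects the per-class regression channel for each detection; the variable
# names are hypothetical.
#
#     roi2det3d = RoI2Det3D()
#     det3d = roi2det3d(out.predictions, det_boxes, class_ids, intrinsics)
#     det3d.boxes_3d[0]           # (N, 12) decoded boxes for image 0
#     det3d.depth_uncertainty[0]  # (N,) values clamped to [0, 1]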

class Box3DUncertaintyLoss(Loss):
    """Box3d loss for QD-3DT."""

    def __init__(
        self,
        reducer: LossReducer = mean_loss,
        center_loss_weight: float = 1.0,
        depth_loss_weight: float = 1.0,
        dimension_loss_weight: float = 1.0,
        rotation_loss_weight: float = 1.0,
        uncertainty_loss_weight: float = 1.0,
        num_rotation_bins: int = 2,
    ) -> None:
        """Creates an instance of the class.

        Args:
            reducer (LossReducer): Reducer for the loss function.
            center_loss_weight (float): Weight for center loss.
            depth_loss_weight (float): Weight for depth loss.
            dimension_loss_weight (float): Weight for dimension loss.
            rotation_loss_weight (float): Weight for rotation loss.
            uncertainty_loss_weight (float): Weight for uncertainty loss.
            num_rotation_bins (int): Number of rotation bins.
        """
        super().__init__(reducer)
        self.center_loss_weight = center_loss_weight
        self.depth_loss_weight = depth_loss_weight
        self.dimension_loss_weight = dimension_loss_weight
        self.rotation_loss_weight = rotation_loss_weight
        self.uncertainty_loss_weight = uncertainty_loss_weight
        self.num_rotation_bins = num_rotation_bins

    def forward(
        self, pred: Tensor, target: Tensor, labels: Tensor
    ) -> LossesType:
        """Compute box3d loss.

        Args:
            pred (Tensor): Box predictions of shape
                [N, num_classes, 6 + 3 * num_rotation_bins + 1].
            target (Tensor): Target boxes of shape
                [N, 6 + 2 * num_rotation_bins].
            labels (Tensor): Target labels of shape [N].

        Returns:
            dict[str, Tensor] containing 'loss_ctr3d', 'loss_dep3d',
                'loss_dim3d', 'loss_rot3d' and 'loss_unc3d'.
        """
        if pred.size(0) == 0:
            loss_ctr3d = loss_dep3d = loss_dim3d = loss_rot3d = (
                loss_unc3d
            ) = (pred.sum() * 0)
            result_dict = {
                "loss_ctr3d": loss_ctr3d,
                "loss_dep3d": loss_dep3d,
                "loss_dim3d": loss_dim3d,
                "loss_rot3d": loss_rot3d,
                "loss_unc3d": loss_unc3d,
            }
            return result_dict

        pred = pred[torch.arange(pred.shape[0], device=pred.device), labels]

        # delta 2dc loss
        loss_cen = smooth_l1_loss(
            pred[:, :2], target[:, :2], reducer=self.reducer, beta=1 / 9
        )

        # dimension loss
        dim_mask = target[:, 3:6] != 100.0
        loss_dim = smooth_l1_loss(
            pred[:, 3:6][dim_mask],
            target[:, 3:6][dim_mask],
            reducer=self.reducer,
            beta=1 / 9,
        )

        # depth loss
        depth_mask = target[:, 2] > 0
        loss_dep = smooth_l1_loss(
            pred[:, 2][depth_mask],
            target[:, 2][depth_mask],
            reducer=self.reducer,
            beta=1 / 9,
        )

        # rotation loss
        loss_rot = rotation_loss(
            pred[:, 6 : 6 + self.num_rotation_bins * 3],
            target[:, 6 : 6 + self.num_rotation_bins],
            target[:, 6 + self.num_rotation_bins :],
            self.num_rotation_bins,
            reducer=self.reducer,
        )

        # uncertainty loss
        pos_depth_self_labels = torch.exp(
            -torch.mul(torch.abs(pred[:, 2] - target[:, 2]), 5.0)
        )
        pos_depth_self_weights = torch.where(
            pos_depth_self_labels > 0.8,
            pos_depth_self_labels.new_ones(1) * 5.0,
            pos_depth_self_labels.new_ones(1) * 0.1,
        )

        loss_unc3d = smooth_l1_loss(
            pred[:, -1],
            pos_depth_self_labels.detach().clone(),
            reducer=SumWeightedLoss(
                pos_depth_self_weights, len(pos_depth_self_weights)
            ),
            beta=1 / 9,
        )

        return {
            "loss_ctr3d": torch.mul(self.center_loss_weight, loss_cen),
            "loss_dep3d": torch.mul(self.depth_loss_weight, loss_dep),
            "loss_dim3d": torch.mul(self.dimension_loss_weight, loss_dim),
            "loss_rot3d": torch.mul(self.rotation_loss_weight, loss_rot),
            "loss_unc3d": torch.mul(
                self.uncertainty_loss_weight, loss_unc3d
            ),
        }
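
# Usage sketch (illustrative, not part of the original module): the loss
# consumes the training output of QD3DTBBox3DHead, indexing the per-class
# predictions with the sampled target labels. `out` is the hypothetical
# head output from the training call shown above.
#
#     loss_fn = Box3DUncertaintyLoss()
#     losses = loss_fn(torch.cat(out.predictions), out.targets, out.labels)
#     total_loss = sum(losses.values())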