# Source code for vis4d.op.base.pointnet

"""Operations for PointNet.

Code adapted from an external PointNet implementation (the source link is
missing from this copy) and modified to allow for modular configuration.
"""

from __future__ import annotations

from collections.abc import Callable, Iterable
from typing import NamedTuple

import torch
from torch import nn

from vis4d.common.typing import ArgsType

class PointNetEncoderOut(NamedTuple):
    """Output of the PointNetEncoder.

    Attributes:
        features: Global features, shape [N, feature_dim].
        pointwise_features: Pointwise features, shape
            [N, last_mlp_dim, n_pts].
        transformations: List with all transformation matrices that were
            used, each of shape [N, d, d].
    """

    features: torch.Tensor
    pointwise_features: torch.Tensor
    # One matrix per learned transformation, in the order they were applied.
    transformations: list[torch.Tensor]
class PointNetSemanticsLoss(NamedTuple):
    """Loss terms of the PointNet semantic segmentation network.

    Bundles the two loss tensors so they can be accessed by name or
    unpacked positionally (semantic loss first, regularization second).
    """

    # Loss term for the semantic predictions.
    semantic_loss: torch.Tensor
    # Regularization loss term.
    regularization_loss: torch.Tensor
class PointNetSemanticsOut(NamedTuple):
    """Result tuple returned by the PointNet segmentation network.

    Attributes:
        class_logits: Per-point class scores, shape [B, n_classes, n_pts].
        transformations: List with all transformation matrices, each of
            shape [B, d, d].
    """

    class_logits: torch.Tensor
    transformations: list[torch.Tensor]
class LinearTransform(nn.Module):
    """Module that learns a linear transformation for an input pointcloud.

    The input is lifted point-wise through a stack of 1x1 convolutions,
    max-pooled over the point axis, and reduced by linear layers to a
    flattened ``in_dimension x in_dimension`` matrix. The identity is added
    to that matrix before it is returned.

    See the T-Net in the PointNet publication for more information.
    """

    def __init__(
        self,
        in_dimension: int = 3,
        upsampling_dims: Iterable[int] = (64, 128, 1024),
        downsampling_dims: Iterable[int] = (1024, 512, 256),
        norm_cls: str | None = "BatchNorm1d",
        activation_cls: str = "ReLU",
    ) -> None:
        """Creates a new LinearTransform.

        This learns a transformation matrix from data.

        Args:
            in_dimension (int): input dimension
            upsampling_dims (Iterable[int]): list of intermediate feature
                shapes for upsampling
            downsampling_dims (Iterable[int]): list of intermediate feature
                shapes for downsampling. Its first entry must match the
                last entry of upsampling_dims
            norm_cls (Optional(str)): class for norm (nn.'norm_cls') or None
            activation_cls (str): class for activation
                (nn.'activation_cls')
        """
        super().__init__()
        # Materialize once: the arguments may be one-shot iterables, so all
        # later uses must go through these lists.
        self.upsampling_dims = list(upsampling_dims)
        self.downsampling_dims = list(downsampling_dims)
        assert (
            len(self.upsampling_dims) != 0
            and len(self.downsampling_dims) != 0
        )
        assert self.upsampling_dims[-1] == self.downsampling_dims[0]

        self.in_dimension_ = in_dimension
        self.identity: torch.Tensor
        self.register_buffer(
            "identity", torch.eye(in_dimension).reshape(1, in_dimension**2)
        )

        # Create activation
        self.activation_ = getattr(nn, activation_cls)()

        # Create norms. The attribute is always defined (None when norms
        # are disabled) because forward() checks it unconditionally.
        self.norms_: nn.ModuleList | None
        if norm_cls is None:
            self.norms_ = None
        else:
            norm_fn: Callable[[int], nn.Module] = getattr(nn, norm_cls)
            # One norm per upsampling layer, followed by one per
            # non-final downsampling layer.
            self.norms_ = nn.ModuleList(
                norm_fn(feature_size)
                for feature_size in (
                    *self.upsampling_dims,
                    *self.downsampling_dims[1:],
                )
            )

        # Create upsampling layers (point-wise 1x1 convolutions)
        up_channels = [in_dimension, *self.upsampling_dims]
        self.upsampling_layers = nn.ModuleList(
            nn.Conv1d(c_in, c_out, 1)
            for c_in, c_out in zip(up_channels[:-1], up_channels[1:])
        )

        # Create downsampling layers; the last one emits the flattened
        # in_dimension x in_dimension matrix.
        down_features = [*self.downsampling_dims, in_dimension**2]
        self.downsampling_layers = nn.ModuleList(
            nn.Linear(f_in, f_out)
            for f_in, f_out in zip(down_features[:-1], down_features[1:])
        )

    def __call__(
        self,
        features: torch.Tensor,
    ) -> torch.Tensor:
        """Type definition for call implementation."""
        return self._call_impl(features)

    def forward(
        self,
        features: torch.Tensor,
    ) -> torch.Tensor:
        """Linear Transform forward.

        Args:
            features (Tensor[B, C, N]): Input features (e.g. points)

        Returns:
            Learned canonical transformation matrix for this input, shape
            [B, in_dimension, in_dimension]. See the T-Net in the PointNet
            publication for further information.
        """
        batchsize = features.shape[0]

        # Upsample features
        for idx, layer in enumerate(self.upsampling_layers):
            features = layer(features)
            if self.norms_ is not None:
                features = self.norms_[idx](features)
            features = self.activation_(features)

        # Pool over the point axis to get one feature vector per sample
        features = torch.max(features, 2, keepdim=True)[0]
        features = features.view(-1, self.upsampling_dims[-1])

        # Downsample features
        last_idx = len(self.downsampling_layers) - 1
        norm_offset = len(self.upsampling_layers)
        for idx, layer in enumerate(self.downsampling_layers):
            features = layer(features)
            # Do not apply norm and activation for the final layer
            if idx != last_idx:
                if self.norms_ is not None:
                    features = self.norms_[norm_offset + idx](features)
                features = self.activation_(features)

        # The network predicts an offset from the identity matrix
        identity_batch = self.identity.repeat(batchsize, 1)
        transformations = features + identity_batch
        return transformations.view(
            batchsize, self.in_dimension_, self.in_dimension_
        )
class PointNetEncoder(nn.Module):
    """PointNetEncoder.

    Encodes a pointcloud and additional features into one feature
    description. Each block first applies a learned transformation to the
    features and then runs them through a point-wise MLP; the final result
    is max-pooled over the point axis.

    See the PointNet publication for more information.
    """

    def __init__(
        self,
        in_dimensions: int = 3,
        out_dimensions: int = 1024,
        mlp_dimensions: Iterable[Iterable[int]] = ((64, 64), (64, 128)),
        norm_cls: str | None = "BatchNorm1d",
        activation_cls: str = "ReLU",
        **kwargs: ArgsType,
    ):
        """Creates a new PointNetEncoder.

        Args:
            in_dimensions (int): input dimension (e.g. 3 for xyz, 6 for
                xyzrgb)
            out_dimensions (int): output dimensions
            mlp_dimensions (Iterable[Iterable[int]]): Dimensions of the MLP
                layers, one inner iterable per block
            norm_cls (Optional(str)): class for norm (nn.'norm_cls') or None
            activation_cls (str): class for activation
                (nn.'activation_cls')
            kwargs: Forwarded to every LinearTransform, see its arguments
        """
        super().__init__()
        self.out_dimension = out_dimensions

        # Prepend the input size to the first block and append the output
        # size to the last one so each block lists all of its channels.
        block_dims: list[list[int]] = [list(dims) for dims in mlp_dimensions]
        block_dims[0].insert(0, in_dimensions)
        block_dims[-1].append(out_dimensions)
        self.mlp_dimensions = block_dims

        # One learnable transformation per block, sized to the block input.
        self.trans_layers_ = nn.ModuleList(
            [
                LinearTransform(
                    in_dimension=dims[0],
                    norm_cls=norm_cls,
                    activation_cls=activation_cls,
                    **kwargs,
                )
                for dims in block_dims
            ]
        )

        # MLP layers
        self.mlp_layers_ = nn.ModuleList()
        shared_act = getattr(nn, activation_cls)()
        make_norm: Callable[[int], nn.Module] = (
            None if norm_cls is None else getattr(nn, norm_cls)
        )

        final_block = len(block_dims) - 1
        for block_idx, dims in enumerate(block_dims):
            final_layer = len(dims) - 2
            modules: list[nn.Module] = []
            for layer_idx, (c_in, c_out) in enumerate(zip(dims, dims[1:])):
                modules.append(torch.nn.Conv1d(c_in, c_out, 1))
                if make_norm is not None:
                    modules.append(make_norm(c_out))
                # The activation is skipped for every layer of the last
                # block and for the last layer of every block.
                # NOTE(review): the original comment reads "only add
                # activation if not last layer" - confirm that skipping it
                # for the whole last block is intended.
                if block_idx == final_block or layer_idx == final_layer:
                    continue
                modules.append(shared_act)
            self.mlp_layers_.append(nn.Sequential(*modules))

    def __call__(self, features: torch.Tensor) -> PointNetEncoderOut:
        """Type definition for call implementation."""
        return self._call_impl(features)

    def forward(self, features: torch.Tensor) -> PointNetEncoderOut:
        """Pointnet encoder forward.

        Args:
            features (Tensor[B, C, N]): Input features stacked in channels.
                e.g. raw point inputs: [B, 3, N], w color: [B, 3+3, N], ...

        Returns:
            Extracted feature representation for input and all applied
            transformations.
        """
        applied: list[torch.Tensor] = []
        final_block = len(self.trans_layers_) - 1

        for block_idx, (transform, mlp) in enumerate(
            zip(self.trans_layers_, self.mlp_layers_)
        ):
            # Predict the transformation and apply it along the channels
            matrix = transform(features)
            applied.append(matrix)
            features = torch.bmm(features.transpose(2, 1), matrix)
            features = features.transpose(2, 1)

            # Keep the transformed per-point features of the last block
            if block_idx == final_block:
                pointwise_features = features.clone()

            features = mlp(features)

        # Max-pool over the point axis to get one global feature vector
        pooled = torch.max(features, 2, keepdim=True)[0]
        pooled = pooled.view(-1, self.out_dimension)

        return PointNetEncoderOut(
            features=pooled,
            transformations=applied,
            pointwise_features=pointwise_features,  # pylint: disable=possibly-used-before-assignment, line-too-long
        )
class PointNetSegmentation(nn.Module):
    """Segmentation network using a simple pointnet as encoder.

    The global feature of the encoder is broadcast to every point,
    concatenated with the encoder's pointwise features and classified
    per point by a stack of 1x1 convolutions.
    """

    def __init__(
        self,
        n_classes: int,
        in_dimensions: int = 3,
        feature_dimension: int = 1024,
        norm_cls: str | None = "BatchNorm1d",
        activation_cls: str = "ReLU",
    ):
        """Creates a new PointNet segmentation network.

        Args:
            n_classes (int): Number of semantic classes
            in_dimensions (int): Input dimension (3 for xyz, 6 xyzrgb, ...)
            feature_dimension (int): Size of feature from the encoder
            norm_cls (Optional(str)): class for norm (nn.'norm_cls') or None
            activation_cls (str): class for activation
                (nn.'activation_cls')
        """
        super().__init__()
        self.in_dimensions = in_dimensions

        self.encoder = PointNetEncoder(
            in_dimensions=in_dimensions,
            out_dimensions=feature_dimension,
            norm_cls=norm_cls,
            activation_cls=activation_cls,
        )
        # Channel count of the pointwise features: the encoder captures
        # them at the input of its last MLP block.
        pc_feat_dim = self.encoder.mlp_dimensions[-1][0]

        # Create activation
        activation = getattr(nn, activation_cls)()
        # Create norms
        norm_fn: Callable[[int], nn.Module] = (
            getattr(nn, norm_cls) if norm_cls is not None else None
        )

        self.classifier_dims = [feature_dimension + pc_feat_dim, 512, 256, 128]

        # Build Model
        layers: list[nn.Module] = []
        for in_dim, out_dim in zip(
            self.classifier_dims[:-1], self.classifier_dims[1:]
        ):
            layers.append(nn.Conv1d(in_dim, out_dim, 1))
            if norm_fn is not None:
                layers.append(norm_fn(out_dim))
            layers.append(activation)
        # Final projection to the class logits (no norm, no activation)
        layers.append(nn.Conv1d(self.classifier_dims[-1], n_classes, 1))
        self.classifier = nn.Sequential(*layers)

    def __call__(self, points: torch.Tensor) -> PointNetSemanticsOut:
        """Call function."""
        return self._call_impl(points)

    def forward(self, points: torch.Tensor) -> PointNetSemanticsOut:
        """Pointnet Segmenter Forward.

        Args:
            points (tensor): inputs points dimension [B, in_dim, n_pts]

        Returns:
            The class logits [B, n_classes, n_pts] together with the linear
            transformation matrices which have been used to transform the
            pointclouds, see LinearTransform.

        Raises:
            AssertionError: If the channel dimension of ``points`` does not
                equal ``in_dimensions``.
        """
        assert points.size(-2) == self.in_dimensions
        n_pts = points.size(-1)
        bs = points.size(0)

        encoder_out = self.encoder(points)

        # Broadcast the global feature to every point; expand avoids an
        # extra copy since torch.cat materializes the result anyway.
        global_features = encoder_out.features.view(bs, -1, 1).expand(
            -1, -1, n_pts
        )
        x = torch.cat([global_features, encoder_out.pointwise_features], 1)
        x = self.classifier(x)

        return PointNetSemanticsOut(
            class_logits=x, transformations=encoder_out.transformations
        )