"""Utility functions for image processing operations."""

from __future__ import annotations

import numpy as np
import torch

from vis4d.common.array import array_to_numpy
from vis4d.common.typing import (
from import AxisMode
from import (
from vis4d.op.geometry.projection import project_points
from vis4d.op.geometry.transform import inverse_rigid_transform
from vis4d.vis.util import DEFAULT_COLOR_MAPPING

def _get_box_label(
    class_id: int | None,
    score: float | None,
    track_id: int | None,
    class_id_mapping: dict[int, str] | None = None,
) -> str:
    """Gets a unique string representation for a box definition.

    Only the parts that are not None are included, always in the order
    class, track id, score.

    Args:
        class_id (int | None): The class id for this box.
        score (float | None): The confidence score. It is rendered as a
            percentage, i.e. multiplied by 100 with one decimal.
        track_id (int | None): The track id.
        class_id_mapping (dict[int, str] | None): Mapping of class_id to
            class name. Ids without an entry fall back to ``str(class_id)``.

    Returns:
        str: Label for this box of format
            'class_name, track_id, score%'
    """
    if class_id_mapping is None:
        class_id_mapping = {}

    labels: list[str] = []
    if class_id is not None:
        labels.append(class_id_mapping.get(class_id, str(class_id)))
    if track_id is not None:
        labels.append(str(track_id))
    if score is not None:
        labels.append(f"{score * 100:.1f}%")
    return ", ".join(labels)

def _to_binary_mask(
    mask: NDArrayUI8, ignore_class: int = 255
) -> tuple[NDArrayUI8, NDArrayUI8]:
    """Converts a mask to binary masks.

    Every distinct value found in ``mask`` (except ``ignore_class``) yields
    one binary mask that is True where the mask equals that value.

    Args:
        mask (NDArrayUI8): The mask to convert with shape [H, W].
        ignore_class (int): The class id to ignore. Defaults to 255.

    Returns:
        NDArrayUI8: The binary masks with shape [N, H, W]. The entries are
            the boolean results of ``mask == class_id``.
        NDArrayUI8: The class ids for each binary mask, in the sorted order
            produced by ``np.unique``.
    """
    class_ids = [
        class_id for class_id in np.unique(mask) if class_id != ignore_class
    ]
    if not class_ids:
        # np.stack raises on an empty sequence, so a mask that only holds
        # the ignore class is mapped to explicit empty results instead.
        return (
            np.zeros((0, *mask.shape), dtype=bool),
            np.array([], dtype=np.uint8),
        )

    binary_masks = [mask == class_id for class_id in class_ids]
    return np.stack(binary_masks, axis=0), np.array(class_ids, dtype=np.uint8)

def preprocess_boxes(
    boxes: ArrayLikeFloat,
    scores: None | ArrayLikeFloat = None,
    class_ids: None | ArrayLikeInt = None,
    track_ids: None | ArrayLikeInt = None,
    color_palette: list[tuple[int, int, int]] = DEFAULT_COLOR_MAPPING,
    class_id_mapping: dict[int, str] | None = None,
    default_color: tuple[int, int, int] = (255, 0, 0),
) -> tuple[
    list[tuple[float, float, float, float]],
    list[str],
    list[tuple[int, int, int]],
]:
    """Preprocesses bounding boxes.

    Converts the given predicted bounding boxes and class/track information
    into lists of corners, labels and colors.

    Args:
        boxes (ArrayLikeFloat): Boxes of shape [N, 4] where N is the number
            of boxes and the second channel consists of (x1,y1,x2,y2) box
            coordinates.
        scores (ArrayLikeFloat): Scores for each box shape [N]
        class_ids (ArrayLikeInt): Class id for each box shape [N]
        track_ids (ArrayLikeInt): Track id for each box shape [N]
        color_palette (list[tuple[int, int, int]]): Color palette for each
            id. It is indexed with ``id % len(color_palette)``.
        class_id_mapping (dict[int, str], optional): Mapping from class id
            to the class name used in the label.
        default_color (tuple[int, int, int]): Fallback color for boxes if
            neither a class nor a track id is given.

    Returns:
        boxes_proc (list[tuple[float, float, float, float]]): List of box
            corners.
        labels_proc (list[str]): List of labels.
        colors_proc (list[tuple[int, int, int]]): List of colors.
    """
    if class_id_mapping is None:
        class_id_mapping = {}

    boxes = array_to_numpy(boxes, n_dims=2, dtype=np.float32)

    # NOTE(review): the None checks below rely on array_to_numpy passing
    # None through unchanged -- confirm against its implementation.
    scores_np = array_to_numpy(scores, n_dims=1, dtype=np.float32)
    class_ids_np = array_to_numpy(class_ids, n_dims=1, dtype=np.int32)
    track_ids_np = array_to_numpy(track_ids, n_dims=1, dtype=np.int32)

    boxes_proc: list[tuple[float, float, float, float]] = []
    colors_proc: list[tuple[int, int, int]] = []
    labels_proc: list[str] = []

    # Only one box provided
    if len(boxes.shape) == 1:
        # unsqueeze one dimension
        boxes = boxes.reshape(1, -1)

    for idx in range(boxes.shape[0]):
        class_id = None if class_ids_np is None else class_ids_np[idx].item()
        score = None if scores_np is None else scores_np[idx].item()
        track_id = None if track_ids_np is None else track_ids_np[idx].item()

        # The track id takes precedence over the class id for the color so
        # that boxes of the same track share one color.
        if track_id is not None:
            color = color_palette[track_id % len(color_palette)]
        elif class_id is not None:
            color = color_palette[class_id % len(color_palette)]
        else:
            color = default_color

        boxes_proc.append(
            (
                boxes[idx][0].item(),
                boxes[idx][1].item(),
                boxes[idx][2].item(),
                boxes[idx][3].item(),
            )
        )
        colors_proc.append(color)
        labels_proc.append(
            _get_box_label(class_id, score, track_id, class_id_mapping)
        )
    return boxes_proc, labels_proc, colors_proc
def preprocess_boxes3d(
    image_hw: tuple[int, int],
    boxes3d: ArrayLikeFloat,
    intrinsics: ArrayLikeFloat,
    extrinsics: ArrayLikeFloat | None = None,
    scores: None | ArrayLikeFloat = None,
    class_ids: None | ArrayLikeInt = None,
    track_ids: None | ArrayLikeInt = None,
    color_palette: list[tuple[int, int, int]] = DEFAULT_COLOR_MAPPING,
    class_id_mapping: dict[int, str] | None = None,
    default_color: tuple[int, int, int] = (255, 0, 0),
    axis_mode: AxisMode = AxisMode.OPENCV,
) -> tuple[
    list[tuple[float, float, float]],
    list[list[tuple[float, float, float]]],
    list[str],
    list[tuple[int, int, int]],
    list[int],
]:
    """Preprocesses bounding boxes.

    Converts the given predicted bounding boxes and class/track information
    into lists of centers, corners, labels, colors and track_ids. Boxes for
    which ``boxes3d_in_image`` reports False are dropped from all outputs.

    Args:
        image_hw (tuple[int, int]): Image size forwarded to
            ``boxes3d_in_image`` (presumably (height, width)).
        boxes3d (ArrayLikeFloat): 2-D array with one 3D box per row. The
            first three entries of a row are returned as the box center.
        intrinsics (ArrayLikeFloat): 2-D camera intrinsic matrix.
        extrinsics (ArrayLikeFloat | None): 2-D camera extrinsic matrix.
            Required if ``axis_mode`` is not ``AxisMode.OPENCV``; its inverse
            rigid transform is used to move the boxes into the camera frame.
        scores (ArrayLikeFloat): Scores for each box shape [N]
        class_ids (ArrayLikeInt): Class id for each box shape [N]
        track_ids (ArrayLikeInt): Track id for each box shape [N]
        color_palette (list[tuple[int, int, int]]): Color palette for each
            id. It is indexed with ``id % len(color_palette)``.
        class_id_mapping (dict[int, str], optional): Mapping from class id
            to the class name used in the label.
        default_color (tuple[int, int, int]): Fallback color for boxes if
            neither a class nor a track id is given.
        axis_mode (AxisMode): Axis mode of the input boxes. Defaults to
            ``AxisMode.OPENCV``, in which case the boxes are used as given.

    Returns:
        centers_proc (list[tuple[float, float, float]]): List of box
            centers, read from the input boxes (not the transformed ones).
        corners_proc (list[list[tuple[float, float, float]]]): List of box
            corners, computed from the camera-frame boxes.
        labels_proc (list[str]): List of labels.
        colors_proc (list[tuple[int, int, int]]): List of colors.
        track_ids_proc (list[int]): List of track ids. Entries are None if
            no ``track_ids`` were given.
    """
    if class_id_mapping is None:
        class_id_mapping = {}

    boxes3d = array_to_numpy(boxes3d, n_dims=2, dtype=np.float32)
    intrinsics = array_to_numpy(intrinsics, n_dims=2, dtype=np.float32)

    boxes3d = torch.from_numpy(boxes3d)
    intrinsics = torch.from_numpy(intrinsics)

    if axis_mode != AxisMode.OPENCV:
        assert (
            extrinsics is not None
        ), "extrinsics must be provided to move boxes to camera coordiante."
        extrinsics = array_to_numpy(extrinsics, n_dims=2, dtype=np.float32)
        extrinsics = torch.from_numpy(extrinsics)
        global_to_cam = inverse_rigid_transform(extrinsics)
        # NOTE(review): the source axis mode is fixed to ROS for every
        # non-OPENCV axis_mode -- confirm that this is intended.
        boxes3d_cam = transform_boxes3d(
            boxes3d,
            global_to_cam,
            source_axis_mode=AxisMode.ROS,
            target_axis_mode=AxisMode.OPENCV,
        )
    else:
        boxes3d_cam = boxes3d

    corners = boxes3d_to_corners(boxes3d_cam, axis_mode=AxisMode.OPENCV)

    mask = boxes3d_in_image(corners, intrinsics, image_hw)

    boxes3d_np = boxes3d.numpy()
    corners_np = corners.numpy()
    scores_np = array_to_numpy(scores, n_dims=1, dtype=np.float32)
    class_ids_np = array_to_numpy(class_ids, n_dims=1, dtype=np.int32)
    track_ids_np = array_to_numpy(track_ids, n_dims=1, dtype=np.int32)

    # Apply the same visibility mask to every per-box array so that the
    # indices stay aligned in the loop below.
    boxes3d_np = boxes3d_np[mask]
    corners_np = corners_np[mask]
    scores_np = scores_np[mask] if scores_np is not None else None
    class_ids_np = class_ids_np[mask] if class_ids_np is not None else None
    track_ids_np = track_ids_np[mask] if track_ids_np is not None else None

    centers_proc: list[tuple[float, float, float]] = []
    corners_proc: list[list[tuple[float, float, float]]] = []
    colors_proc: list[tuple[int, int, int]] = []
    labels_proc: list[str] = []
    track_ids_proc: list[int] = []

    for idx in range(corners_np.shape[0]):
        class_id = None if class_ids_np is None else class_ids_np[idx].item()
        score = None if scores_np is None else scores_np[idx].item()
        track_id = None if track_ids_np is None else track_ids_np[idx].item()

        # The track id takes precedence over the class id for the color so
        # that boxes of the same track share one color.
        if track_id is not None:
            color = color_palette[track_id % len(color_palette)]
        elif class_id is not None:
            color = color_palette[class_id % len(color_palette)]
        else:
            color = default_color

        centers_proc.append(
            (
                boxes3d_np[idx][0].item(),
                boxes3d_np[idx][1].item(),
                boxes3d_np[idx][2].item(),
            )
        )
        corners_proc.append([tuple(pts) for pts in corners_np[idx].tolist()])
        colors_proc.append(color)
        labels_proc.append(
            _get_box_label(class_id, score, track_id, class_id_mapping)
        )
        track_ids_proc.append(track_id)
    return centers_proc, corners_proc, labels_proc, colors_proc, track_ids_proc
def preprocess_masks(
    masks: ArrayLikeUInt,
    class_ids: ArrayLikeInt | None = None,
    color_mapping: list[tuple[int, int, int]] = DEFAULT_COLOR_MAPPING,
) -> tuple[list[NDArrayBool], list[tuple[int, int, int]]]:
    """Preprocesses predicted semantic or instance segmentation masks.

    Args:
        masks (ArrayLikeUInt): Masks of shape [H, W] or [N, H, W]. If the
            masks are of shape [H, W], they are assumed to be semantic
            segmentation masks, i.e. each pixel contains the class id. If the
            masks are of shape [N, H, W], they are assumed to be the binary
            masks of N instances.
        class_ids (ArrayLikeInt, None): An array with class ids for each mask
            shape [N]. For masks of shape [H, W] this argument is ignored and
            the class ids are extracted from the masks. If None for masks of
            shape [N, H, W], each mask is colored by its index instead.
        color_mapping (list[tuple[int, int, int]]): Color mapping for
            each class. It is indexed with ``id % len(color_mapping)``.

    Returns:
        tuple[list[masks], list[colors]]: Returns a list with all masks of
            shape [H, W] as well as a list with the corresponding colors.

    Raises:
        ValueError: If the masks have an invalid shape.
    """
    masks_np: NDArrayUI8 = array_to_numpy(  # type: ignore
        masks, n_dims=None, dtype=np.uint8
    )

    if len(masks_np.shape) == 2:
        # Semantic mask: split it into one binary mask per class id.
        masks_np, class_ids = _to_binary_mask(masks_np)
    elif len(masks_np.shape) == 3:
        if class_ids is not None:
            class_ids = array_to_numpy(class_ids, n_dims=1, dtype=np.int32)
    else:
        raise ValueError(
            f"Expected masks to have 2 or 3 dimensions, but got "
            f"{len(masks_np.shape)}"
        )

    masks_binary = masks_np.astype(bool)
    mask_list: list[NDArrayBool] = []
    color_list: list[tuple[int, int, int]] = []

    for idx in range(masks_binary.shape[0]):
        mask = masks_binary[idx, ...]

        class_id = None if class_ids is None else class_ids[idx].item()
        # Without class ids the position of the mask selects the color.
        if class_id is not None:
            color = color_mapping[class_id % len(color_mapping)]
        else:
            color = color_mapping[idx % len(color_mapping)]
        mask_list.append(mask)
        color_list.append(color)
    return mask_list, color_list
def preprocess_image(image: ArrayLike, mode: str = "RGB") -> NDArrayUI8:
    """Validate and convert input image.

    The image is brought into HWC layout and every channel is min-max
    normalized independently to the range [0, 255].

    Args:
        image: CHW or HWC image (ArrayLike) with C = 3.
        mode: Input channel format. Only "BGR" triggers a conversion (the
            channel order is reversed); any other value leaves the channel
            order untouched.

    Returns:
        np.array[uint8]: Processed image_np in HWC layout, in RGB if the
            input was RGB or BGR.
    """
    image_np = array_to_numpy(image, n_dims=3, dtype=np.float32)
    # Validate that the image has three dimensions and three channels
    assert len(image_np.shape) == 3
    assert image_np.shape[0] == 3 or image_np.shape[-1] == 3

    # Convert torch (CHW) to numpy (HWC) convention
    if not image_np.shape[-1] == 3:
        image_np = np.transpose(image_np, (1, 2, 0))  # type: ignore

    # Convert image_np to [0, 255]
    min_val, max_val = (
        np.min(image_np, axis=(0, 1)),
        np.max(image_np, axis=(0, 1)),
    )

    image_np = image_np.astype(np.float32)

    # A constant channel has max == min; dividing by one instead of zero
    # maps it to 0 rather than producing NaN before the uint8 cast.
    value_range = max_val - min_val
    value_range = np.where(value_range > 0, value_range, 1.0)

    image_np = (image_np - min_val) / value_range * 255.0

    if mode == "BGR":
        image_np = image_np[..., [2, 1, 0]]

    return image_np.astype(np.uint8)
def get_intersection_point(
    point1: tuple[float, float, float],
    point2: tuple[float, float, float],
    camera_near_clip: float,
) -> tuple[float, float, float]:
    """Get point intersecting with camera near plane on line point1 -> point2.

    The line is defined by two points in camera coordinates and their depth.
    The near plane is the plane ``z = camera_near_clip``.

    Args:
        point1 (tuple[float x 3]): First point in camera coordinates.
        point2 (tuple[float x 3]): Second point in camera coordinates
        camera_near_clip (float): Depth of the camera near plane.

    Returns:
        tuple[float, float, float]: The intersection point in camera
            coordinates. Its depth is always ``camera_near_clip``. If the
            plane is further from point1 than point2 is (in depth), the x/y
            of point2 are returned. If both points lie on the near plane,
            point1 is returned.
    """
    x1, y1, z1 = point1
    x2, y2, z2 = point2

    # The plane normal is (0, 0, 1), so only the depth differences matter.
    dist_to_plane = abs(z1 - camera_near_clip)
    segment_depth = abs(z1 - z2)

    if dist_to_plane > segment_depth:
        # Clamp to point2 instead of extrapolating beyond the segment.
        k = 1.0
    elif segment_depth == 0:
        # Both points are on the near plane; avoid dividing 0 by 0.
        k = 0.0
    else:
        k = dist_to_plane / segment_depth

    return ((1 - k) * x1 + k * x2, (1 - k) * y1 + k * y2, camera_near_clip)
def project_point(
    point: tuple[float, float, float], intrinsics: NDArrayF32
) -> tuple[float, float]:
    """Project single point into the image plane.

    Args:
        point (tuple[float, float, float]): The 3D point to project
            (presumably in camera coordinates -- confirm against
            ``project_points``).
        intrinsics (NDArrayF32): Camera intrinsics as a numpy array. It is
            handed to ``project_points`` as a torch tensor.

    Returns:
        tuple[float, float]: The two coordinates of the projected point.
    """
    # project_points works on batches, so the point is wrapped into a
    # batch of size one and unwrapped again afterwards.
    points = torch.from_numpy(np.array([point], dtype=np.float32))
    projected = project_points(points, torch.from_numpy(intrinsics))
    projected_x, projected_y = projected.squeeze(0).numpy().tolist()
    return projected_x, projected_y