"""Utility functions for image processing operations."""
from __future__ import annotations
import numpy as np
import torch
from vis4d.common.array import array_to_numpy
from vis4d.common.typing import (
ArrayLike,
ArrayLikeFloat,
ArrayLikeInt,
ArrayLikeUInt,
NDArrayBool,
NDArrayF32,
NDArrayUI8,
)
from vis4d.data.const import AxisMode
from vis4d.op.box.box3d import (
boxes3d_in_image,
boxes3d_to_corners,
transform_boxes3d,
)
from vis4d.op.geometry.projection import project_points
from vis4d.op.geometry.transform import inverse_rigid_transform
from vis4d.vis.util import DEFAULT_COLOR_MAPPING
def _get_box_label(
class_id: int | None,
score: float | None,
track_id: int | None,
class_id_mapping: dict[int, str] | None = None,
) -> str:
"""Gets a unique string representation for a box definition.
Args:
class_id (int): The class id for this box
score (float): The confidence score
track_id (int): The track id
class_id_mapping (dict[int,str]): Mapping of class_id to class name
Returns:
str: Label for this box of format
'class_name, track_id, score%'
"""
labels = []
if class_id_mapping is None:
class_id_mapping = {}
if class_id is not None:
labels.append(class_id_mapping.get(class_id, str(class_id)))
if track_id is not None:
labels.append(str(track_id))
if score is not None:
labels.append(f"{score * 100:.1f}%")
return ", ".join(labels)
def _to_binary_mask(
mask: NDArrayUI8, ignore_class: int = 255
) -> tuple[NDArrayUI8, NDArrayUI8]:
"""Converts a mask to binary masks.
Args:
mask (NDArrayUI8): The mask to convert with shape [H, W].
ignore_class (int): The class id to ignore. Defaults to 255.
Returns:
NDArrayUI8: The binary masks with shape [N, H, W].
NDArrayUI8: The class ids for each binary mask.
"""
binary_masks = []
class_ids = []
for class_id in np.unique(mask):
if class_id == ignore_class:
continue
binary_masks.append(mask == class_id)
class_ids.append(class_id)
return np.stack(binary_masks, axis=0), np.array(class_ids, dtype=np.uint8)
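
# Usage sketch (illustrative, not part of the original module): a semantic
# mask with classes {0, 1} and an ignored region (255) yields one binary
# mask per remaining class.
#
# >>> semantic = np.array([[0, 0, 1], [1, 255, 1]], dtype=np.uint8)
# >>> binary, ids = _to_binary_mask(semantic)
# >>> binary.shape, ids.tolist()
# ((2, 2, 3), [0, 1])
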
def preprocess_boxes(
boxes: ArrayLikeFloat,
scores: None | ArrayLikeFloat = None,
class_ids: None | ArrayLikeInt = None,
track_ids: None | ArrayLikeInt = None,
color_palette: list[tuple[int, int, int]] = DEFAULT_COLOR_MAPPING,
class_id_mapping: dict[int, str] | None = None,
default_color: tuple[int, int, int] = (255, 0, 0),
) -> tuple[
list[tuple[float, float, float, float]],
list[str],
list[tuple[int, int, int]],
]:
"""Preprocesses bounding boxes.
Converts the given predicted bounding boxes and class/track information
into lists of corners, labels and colors.
Args:
boxes (ArrayLikeFloat): Boxes of shape [N, 4] where N is the number of
boxes and the second channel consists of
(x1,y1,x2,y2) box coordinates.
scores (ArrayLikeFloat): Scores for each box shape [N]
class_ids (ArrayLikeInt): Class id for each box shape [N]
track_ids (ArrayLikeInt): Track id for each box shape [N]
color_palette (list[tuple[float, float, float]]): Color palette for
each id.
class_id_mapping(dict[int, str], optional): Mapping from class id
to color tuple (0-255).
default_color (tuple[int, int, int]): fallback color for boxes of no
class or track id is given.
Returns:
boxes_proc (list[tuple[float, float, float, float]]): List of box
corners.
labels_proc (list[str]): List of labels.
colors_proc (list[tuple[int, int, int]]): List of colors.
"""
if class_id_mapping is None:
class_id_mapping = {}
boxes = array_to_numpy(boxes, n_dims=2, dtype=np.float32)
scores_np = array_to_numpy(scores, n_dims=1, dtype=np.float32)
class_ids_np = array_to_numpy(class_ids, n_dims=1, dtype=np.int32)
track_ids_np = array_to_numpy(track_ids, n_dims=1, dtype=np.int32)
boxes_proc: list[tuple[float, float, float, float]] = []
colors_proc: list[tuple[int, int, int]] = []
labels_proc: list[str] = []
# Only one box provided
if len(boxes.shape) == 1:
# unsqueeze one dimension
boxes = boxes.reshape(1, -1)
for idx in range(boxes.shape[0]):
class_id = None if class_ids_np is None else class_ids_np[idx].item()
score = None if scores_np is None else scores_np[idx].item()
track_id = None if track_ids_np is None else track_ids_np[idx].item()
if track_id is not None:
color = color_palette[track_id % len(color_palette)]
elif class_id is not None:
color = color_palette[class_id % len(color_palette)]
else:
color = default_color
boxes_proc.append(
(
boxes[idx][0].item(),
boxes[idx][1].item(),
boxes[idx][2].item(),
boxes[idx][3].item(),
)
)
colors_proc.append(color)
labels_proc.append(
_get_box_label(class_id, score, track_id, class_id_mapping)
)
return boxes_proc, labels_proc, colors_proc
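
# Usage sketch (illustrative values, not part of the original module):
#
# >>> boxes = np.array([[10.0, 20.0, 50.0, 60.0]], dtype=np.float32)
# >>> corners, labels, colors = preprocess_boxes(
# ...     boxes, scores=[0.9], class_ids=[0], class_id_mapping={0: "car"}
# ... )
# >>> corners
# [(10.0, 20.0, 50.0, 60.0)]
# >>> labels
# ['car, 90.0%']
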
def preprocess_boxes3d(
image_hw: tuple[int, int],
boxes3d: ArrayLikeFloat,
intrinsics: ArrayLikeFloat,
extrinsics: ArrayLikeFloat | None = None,
scores: None | ArrayLikeFloat = None,
class_ids: None | ArrayLikeInt = None,
track_ids: None | ArrayLikeInt = None,
color_palette: list[tuple[int, int, int]] = DEFAULT_COLOR_MAPPING,
class_id_mapping: dict[int, str] | None = None,
default_color: tuple[int, int, int] = (255, 0, 0),
axis_mode: AxisMode = AxisMode.OPENCV,
) -> tuple[
list[tuple[float, float, float]],
list[list[tuple[float, float, float]]],
list[str],
list[tuple[int, int, int]],
list[int],
]:
"""Preprocesses bounding boxes.
Converts the given predicted bounding boxes and class/track information
into lists of centers, corners, labels, colors and track_ids.
"""
if class_id_mapping is None:
class_id_mapping = {}
boxes3d = array_to_numpy(boxes3d, n_dims=2, dtype=np.float32)
intrinsics = array_to_numpy(intrinsics, n_dims=2, dtype=np.float32)
boxes3d = torch.from_numpy(boxes3d)
intrinsics = torch.from_numpy(intrinsics)
    if axis_mode != AxisMode.OPENCV:
        assert (
            extrinsics is not None
        ), "extrinsics must be provided to move boxes to camera coordinates."
        extrinsics = array_to_numpy(extrinsics, n_dims=2, dtype=np.float32)
        extrinsics = torch.from_numpy(extrinsics)
        # Transform boxes from their source frame into the OpenCV camera
        # frame using the inverse camera extrinsics
        global_to_cam = inverse_rigid_transform(extrinsics)
        boxes3d_cam = transform_boxes3d(
            boxes3d,
            global_to_cam,
            source_axis_mode=axis_mode,
            target_axis_mode=AxisMode.OPENCV,
        )
else:
boxes3d_cam = boxes3d
corners = boxes3d_to_corners(boxes3d_cam, axis_mode=AxisMode.OPENCV)
mask = boxes3d_in_image(corners, intrinsics, image_hw)
boxes3d_np = boxes3d.numpy()
corners_np = corners.numpy()
scores_np = array_to_numpy(scores, n_dims=1, dtype=np.float32)
class_ids_np = array_to_numpy(class_ids, n_dims=1, dtype=np.int32)
track_ids_np = array_to_numpy(track_ids, n_dims=1, dtype=np.int32)
boxes3d_np = boxes3d_np[mask]
corners_np = corners_np[mask]
scores_np = scores_np[mask] if scores_np is not None else None
class_ids_np = class_ids_np[mask] if class_ids_np is not None else None
track_ids_np = track_ids_np[mask] if track_ids_np is not None else None
centers_proc: list[tuple[float, float, float]] = []
corners_proc: list[list[tuple[float, float, float]]] = []
colors_proc: list[tuple[int, int, int]] = []
labels_proc: list[str] = []
track_ids_proc: list[int] = []
for idx in range(corners_np.shape[0]):
class_id = None if class_ids_np is None else class_ids_np[idx].item()
score = None if scores_np is None else scores_np[idx].item()
track_id = None if track_ids_np is None else track_ids_np[idx].item()
if track_id is not None:
color = color_palette[track_id % len(color_palette)]
elif class_id is not None:
color = color_palette[class_id % len(color_palette)]
else:
color = default_color
centers_proc.append(
(
boxes3d_np[idx][0].item(),
boxes3d_np[idx][1].item(),
boxes3d_np[idx][2].item(),
)
)
corners_proc.append([tuple(pts) for pts in corners_np[idx].tolist()])
colors_proc.append(color)
labels_proc.append(
_get_box_label(class_id, score, track_id, class_id_mapping)
)
track_ids_proc.append(track_id)
return centers_proc, corners_proc, labels_proc, colors_proc, track_ids_proc
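
# Usage sketch (illustrative, not part of the original module), assuming
# the library's 10-parameter box encoding (3D center, dimensions,
# orientation quaternion) and boxes already in OpenCV camera coordinates:
#
# >>> boxes3d = np.array(
# ...     [[0.0, 1.5, 10.0, 1.8, 1.6, 4.0, 1.0, 0.0, 0.0, 0.0]],
# ...     dtype=np.float32,
# ... )
# >>> intrinsics = np.array(
# ...     [[1000.0, 0.0, 800.0], [0.0, 1000.0, 450.0], [0.0, 0.0, 1.0]],
# ...     dtype=np.float32,
# ... )
# >>> centers, corners, labels, colors, tids = preprocess_boxes3d(
# ...     (900, 1600), boxes3d, intrinsics, class_ids=[0]
# ... )
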
def preprocess_masks(
masks: ArrayLikeUInt,
class_ids: ArrayLikeInt | None = None,
color_mapping: list[tuple[int, int, int]] = DEFAULT_COLOR_MAPPING,
) -> tuple[list[NDArrayBool], list[tuple[int, int, int]]]:
"""Preprocesses predicted semantic or instance segmentation masks.
Args:
masks (ArrayLikeUInt): Masks of shape [H, W] or [N, H, W]. If the
masks are of shape [H, W], they are assumed to be semantic
segmentation masks, i.e. each pixel contains the class id.
If the masks are of shape [N, H, W], they are assumed to be
the binary masks of N instances.
class_ids (ArrayLikeInt, None): An array with class ids for each mask
shape [N]. If None, then the masks must be semantic segmentation
masks and the class ids are extracted from the masks.
color_mapping (list[tuple[int, int, int]]): Color mapping for
each class.
Returns:
tuple[list[masks], list[colors]]: Returns a list with all masks of
shape [H, W] as well as a list with the corresponding colors.
Raises:
ValueError: If the masks have an invalid shape.
"""
masks_np: NDArrayUI8 = array_to_numpy( # type: ignore
masks, n_dims=None, dtype=np.uint8
)
if len(masks_np.shape) == 2:
masks_np, class_ids = _to_binary_mask(masks_np)
elif len(masks_np.shape) == 3:
if class_ids is not None:
class_ids = array_to_numpy(class_ids, n_dims=1, dtype=np.int32)
else:
raise ValueError(
f"Expected masks to have 2 or 3 dimensions, but got "
f"{len(masks_np.shape)}"
)
masks_binary = masks_np.astype(bool)
mask_list: list[NDArrayBool] = []
color_list: list[tuple[int, int, int]] = []
for idx in range(masks_binary.shape[0]):
mask = masks_binary[idx, ...]
class_id = None if class_ids is None else class_ids[idx].item()
if class_id is not None:
color = color_mapping[class_id % len(color_mapping)]
else:
color = color_mapping[idx % len(color_mapping)]
mask_list.append(mask)
color_list.append(color)
return mask_list, color_list
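
# Usage sketch (illustrative, not part of the original module): a [H, W]
# semantic mask is split into one boolean mask per class, each paired
# with a palette color.
#
# >>> semantic = np.zeros((4, 4), dtype=np.uint8)
# >>> semantic[2:, 2:] = 1
# >>> masks, colors = preprocess_masks(semantic)
# >>> len(masks), masks[0].shape, masks[0].dtype
# (2, (4, 4), dtype('bool'))
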
def preprocess_image(image: ArrayLike, mode: str = "RGB") -> NDArrayUI8:
"""Validate and convert input image.
Args:
image: CHW or HWC image (ArrayLike) with C = 3.
mode: input channel format (e.g. BGR, HSV).
Returns:
np.array[uint8]: Processed image_np in RGB.
"""
image_np = array_to_numpy(image, n_dims=3, dtype=np.float32)
# Convert torch to numpy
assert len(image_np.shape) == 3
assert image_np.shape[0] == 3 or image_np.shape[-1] == 3
# Convert torch to numpy convention
if not image_np.shape[-1] == 3:
image_np = np.transpose(image_np, (1, 2, 0)) # type: ignore
# Convert image_np to [0, 255]
min_val, max_val = (
np.min(image_np, axis=(0, 1)),
np.max(image_np, axis=(0, 1)),
)
image_np = image_np.astype(np.float32)
image_np = (image_np - min_val) / (max_val - min_val) * 255.0
if mode == "BGR":
image_np = image_np[..., [2, 1, 0]]
return image_np.astype(np.uint8)
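
# Usage sketch (illustrative, not part of the original module): a CHW
# float image is transposed to HWC and rescaled to uint8 in [0, 255].
#
# >>> chw = np.random.rand(3, 8, 8).astype(np.float32)
# >>> out = preprocess_image(chw, mode="RGB")
# >>> out.shape, out.dtype
# ((8, 8, 3), dtype('uint8'))
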
def get_intersection_point(
point1: tuple[float, float, float],
point2: tuple[float, float, float],
camera_near_clip: float,
) -> tuple[float, float, float]:
"""Get point intersecting with camera near plane on line point1 -> point2.

    Both points are given in camera coordinates; the near plane is the
    plane z = camera_near_clip.

    Args:
        point1 (tuple[float, float, float]): First point in camera
            coordinates.
        point2 (tuple[float, float, float]): Second point in camera
            coordinates.
        camera_near_clip (float): Distance of the camera near plane.

    Returns:
        tuple[float, float, float]: The intersection point in camera
            coordinates.
    """
    # The near plane z = camera_near_clip with normal (0, 0, 1)
    c1, c2, c3 = 0, 0, camera_near_clip
    a1, a2, a3 = 0, 0, 1
    x1, y1, z1 = point1
    x2, y2, z2 = point2
    # Distance of point1 to the plane and length of the segment projected
    # onto the plane normal
    k_up = abs(a1 * (x1 - c1) + a2 * (y1 - c2) + a3 * (z1 - c3))
    k_down = abs(a1 * (x1 - x2) + a2 * (y1 - y2) + a3 * (z1 - z2))
    if k_up > k_down:
        # The segment does not reach the plane; clamp to point2
        k = 1.0
    else:
        k = k_up / k_down
    return ((1 - k) * x1 + k * x2, (1 - k) * y1 + k * y2, camera_near_clip)
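
# Worked example (illustrative, not part of the original module): a
# segment crossing the near plane z = 1 from z = -1 to z = 3 is
# intersected exactly halfway (k = k_up / k_down = 2 / 4 = 0.5):
#
# >>> get_intersection_point((0.0, 0.0, -1.0), (0.0, 0.0, 3.0), 1.0)
# (0.0, 0.0, 1.0)
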
def project_point(
point: tuple[float, float, float], intrinsics: NDArrayF32
) -> tuple[float, float]:
"""Project single point into the image plane."""
projected_x, projected_y = (
project_points(
torch.from_numpy(np.array([point], dtype=np.float32)),
torch.from_numpy(intrinsics),
)
.squeeze(0)
.numpy()
.tolist()
)
return projected_x, projected_y
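
# Usage sketch (illustrative, not part of the original module), assuming a
# standard pinhole projection u = fx * x / z + cx, v = fy * y / z + cy:
#
# >>> intrinsics = np.array(
# ...     [[100.0, 0.0, 50.0], [0.0, 100.0, 50.0], [0.0, 0.0, 1.0]],
# ...     dtype=np.float32,
# ... )
# >>> project_point((1.0, 2.0, 10.0), intrinsics)
# (60.0, 70.0)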