Source code for akida_models.detection.processing

#!/usr/bin/env python
# ******************************************************************************
# Copyright 2020 Brainchip Holdings Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
"""
Processing tools for YOLO data handling.
"""

__all__ = ["BoundingBox", "load_image", "preprocess_image", "decode_output", "create_yolo_targets"]


import numpy as np
import tensorflow as tf
from .data_utils import Coord
from .box_utils import compute_center_wh, compute_center_xy


class BoundingBox:
    """ Utility class to represent a bounding box.

    The box is defined by its top left corner (x1, y1), bottom right corner
    (x2, y2), label, score and classes.
    """

    def __init__(self, x1, y1, x2, y2, score=-1, classes=None):
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.label = -1
        self.score = score
        self.classes = classes

    def __repr__(self):
        return "<BoundingBox({}, {}, {}, {}, {}, {}, {})>\n".format(
            self.x1, self.x2, self.y1, self.y2, self.get_label(),
            self.get_score(), self.classes)

    def get_label(self):
        """ Returns the label for this bounding box.

        Returns:
            Index of the label as an integer.
        """
        if self.label == -1:
            self.label = tf.argmax(self.classes)
        return self.label

    def get_score(self):
        """ Returns the score for this bounding box.

        Returns:
            Confidence as a float.
        """
        if self.score == -1:
            self.score = self.classes[self.get_label()]
        return self.score

    def iou(self, other):
        """ Computes the intersection over union ratio between this bounding box
        and another one.

        Args:
            other (BoundingBox): the other bounding box for IOU computation

        Returns:
            IOU value as a float
        """
        def _interval_overlap(interval_1, interval_2):
            x1, x2 = interval_1
            x3, x4 = interval_2
            x1, x2, x3, x4 = (tf.cast(x1, dtype=tf.float32),
                              tf.cast(x2, dtype=tf.float32),
                              tf.cast(x3, dtype=tf.float32),
                              tf.cast(x4, dtype=tf.float32))
            if x3 < x1:
                if x4 < x1:
                    return tf.constant(0, dtype=tf.float32)
                return tf.minimum(x2, x4) - x1
            if x2 < x3:
                return tf.constant(0, dtype=tf.float32)
            return tf.minimum(x2, x4) - x3

        intersect_w = _interval_overlap([self.x1, self.x2], [other.x1, other.x2])
        intersect_h = _interval_overlap([self.y1, self.y2], [other.y1, other.y2])

        intersect = intersect_w * intersect_h

        w1, h1 = self.x2 - self.x1, self.y2 - self.y1
        w2, h2 = other.x2 - other.x1, other.y2 - other.y1

        union = w1 * h1 + w2 * h2 - intersect

        return tf.cast(intersect, dtype=tf.float32) / tf.cast(union, dtype=tf.float32)
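
# Illustrative usage sketch (not part of the original module): two overlapping
# boxes, their IOU, and the label/score accessors. Coordinates and class scores
# below are arbitrary example values.
def _example_bounding_box_usage():
    box_a = BoundingBox(0, 0, 10, 10, classes=np.array([0.1, 0.8]))
    box_b = BoundingBox(5, 5, 15, 15)
    overlap = box_a.iou(box_b)   # ~0.14 for these coordinates
    label = box_a.get_label()    # argmax of classes -> 1
    score = box_a.get_score()    # classes[label] -> 0.8
    return overlap, label, score
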
def load_image(image_path):
    """ Loads an image from a path.

    Args:
        image_path (string): full path of the image to load

    Returns:
        a Tensorflow image Tensor
    """
    raw_image = tf.io.read_file(image_path)
    return tf.image.decode_jpeg(raw_image, channels=3)
def preprocess_image(image_buffer, output_size):
    """ Preprocess an image for YOLO inference.

    Args:
        image_buffer (tf.Tensor): image to preprocess
        output_size (tuple): shape of the image after preprocessing

    Returns:
        The resized image as a Numpy array.
    """
    # Resize
    width = tf.constant(output_size[0])
    height = tf.constant(output_size[1])
    image = tf.compat.v1.image.resize(image_buffer, [height, width],
                                      method=tf.image.ResizeMethod.BILINEAR,
                                      align_corners=False)
    return image.numpy()
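
# Illustrative usage sketch (not part of the original module): loading a JPEG
# from disk and resizing it to the model input resolution. The file path and the
# 224x224 target size are placeholders.
def _example_load_and_preprocess(image_path="/path/to/image.jpg"):
    image = load_image(image_path)
    return preprocess_image(image, output_size=(224, 224))
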
def create_yolo_targets(objects, grid_size, num_classes, anchors):
    """ Creates YOLO-style targets tensor for the given objects.

    Args:
        objects (dict): Dictionary containing information about objects in the
            image, including labels and bounding boxes.
        grid_size (tuple): The grid size used for YOLO target generation.
        num_classes (int): The number of classes.
        anchors (list): List of anchor boxes.

    Returns:
        targets (tf.Tensor): The targets output tensor.
    """
    def _update_bbox_target(bbox, grid_y, grid_x, best_anchor, targets):
        for i in range(4):
            indices_bbox = [[grid_y, grid_x, best_anchor, i]]
            targets = tf.tensor_scatter_nd_update(targets, indices_bbox, updates=[bbox[i]])
        return targets

    def _update_confidence_target(grid_y, grid_x, best_anchor, targets):
        indices_confidence = [[grid_y, grid_x, best_anchor, 4]]
        return tf.tensor_scatter_nd_update(targets, indices_confidence, updates=[1.])

    def _update_class_target(grid_y, grid_x, best_anchor, obj_indx, targets):
        indices_class = [[grid_y, grid_x, best_anchor, tf.cast(5 + obj_indx, tf.int32)]]
        return tf.tensor_scatter_nd_update(targets, indices_class, updates=[1.])

    n_anchors = len(anchors)
    anchors = [BoundingBox(0, 0, anchors[i][0], anchors[i][1]) for i in range(len(anchors))]

    targets = tf.zeros((grid_size[0], grid_size[1], n_anchors, 5 + num_classes), dtype=tf.float32)
    num_objects = tf.shape(objects['label'])[0]

    for idx in range(num_objects):
        bbox = objects['bbox'][idx]
        if bbox[Coord.x2] > bbox[Coord.x1] and bbox[Coord.y2] > bbox[Coord.y1]:
            center_x, center_y = compute_center_xy(bbox, grid_size)

            # find grid index where the center is located
            grid_x = tf.cast(center_x, tf.int32)
            grid_y = tf.cast(center_y, tf.int32)

            if grid_x < grid_size[1] and grid_y < grid_size[0]:
                obj_indx = objects['label'][idx]
                center_w, center_h = compute_center_wh(bbox, grid_size)

                box = [center_x, center_y, center_w, center_h]

                # find the anchor that best predicts this box
                best_anchor = -1
                max_iou = tf.constant(-1, dtype=tf.float32)

                shifted_box = BoundingBox(0, 0, center_w, center_h)

                for anchor_id, anchor in enumerate(anchors):
                    iou = shifted_box.iou(anchor)
                    if max_iou < iou:
                        best_anchor = anchor_id
                        max_iou = iou

                targets = _update_bbox_target(box, grid_y, grid_x, best_anchor, targets)
                targets = _update_confidence_target(grid_y, grid_x, best_anchor, targets)
                targets = _update_class_target(grid_y, grid_x, best_anchor, obj_indx, targets)

    return targets
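
# Illustrative usage sketch (not part of the original module): encoding a single
# ground-truth object into a YOLO targets tensor on a 7x7 grid. The bounding box
# is assumed to be in normalized coordinates following the Coord convention of
# data_utils; the anchors and class count are example values only.
def _example_create_targets():
    objects = {
        'label': tf.constant([0], dtype=tf.int32),
        'bbox': tf.constant([[0.2, 0.2, 0.6, 0.6]], dtype=tf.float32)
    }
    anchors = [[0.57, 0.67], [1.87, 2.06]]
    # Resulting shape: (7, 7, len(anchors), 5 + num_classes)
    return create_yolo_targets(objects, grid_size=(7, 7), num_classes=2, anchors=anchors)
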
def decode_output(output, anchors, nb_classes, obj_threshold=0.5, nms_threshold=0.5):
    """ Decodes a YOLO model output.

    Args:
        output (tf.Tensor): model output to decode
        anchors (list): list of anchor boxes
        nb_classes (int): number of classes
        obj_threshold (float, optional): confidence threshold for a box. Defaults to 0.5.
        nms_threshold (float, optional): non-maximal suppression threshold. Defaults to 0.5.

    Returns:
        List of `BoundingBox` objects
    """
    def _sigmoid(x):
        return 1. / (1. + np.exp(-x))

    def _softmax(x, axis=-1, t=-100.):
        x = x - np.max(x)
        if np.min(x) < t:
            x = x / np.min(x) * t
        e_x = np.exp(x)
        return e_x / e_x.sum(axis, keepdims=True)

    grid_h, grid_w, nb_box = output.shape[:3]
    boxes = []

    # decode the output of the network
    output[..., 4] = _sigmoid(output[..., 4])
    output[..., 5:] = output[..., 4][..., np.newaxis] * _softmax(output[..., 5:])
    output[..., 5:] *= output[..., 5:] > obj_threshold

    col, row, _ = np.meshgrid(np.arange(grid_w), np.arange(grid_h), np.arange(nb_box))

    x = (col + _sigmoid(output[..., 0])) / grid_w
    y = (row + _sigmoid(output[..., 1])) / grid_h
    w = np.array(anchors)[:, 0] * np.exp(output[..., 2]) / grid_w
    h = np.array(anchors)[:, 1] * np.exp(output[..., 3]) / grid_h

    x1 = np.maximum(x - w / 2, 0)
    y1 = np.maximum(y - h / 2, 0)
    x2 = np.minimum(x + w / 2, grid_w)
    y2 = np.minimum(y + h / 2, grid_h)

    confidence = output[..., 4]
    classes = output[..., 5:]

    mask = np.sum(classes, axis=-1) > 0
    indices = np.where(mask)

    for i in range(len(indices[0])):
        row_idx, col_idx, box_idx = indices[0][i], indices[1][i], indices[2][i]
        box = BoundingBox(x1[row_idx, col_idx, box_idx],
                          y1[row_idx, col_idx, box_idx],
                          x2[row_idx, col_idx, box_idx],
                          y2[row_idx, col_idx, box_idx],
                          confidence[row_idx, col_idx, box_idx],
                          classes[row_idx, col_idx, box_idx])
        boxes.append(box)

    # suppress non-maximal boxes
    for c in range(nb_classes):
        sorted_indices = np.argsort([box.classes[c] for box in boxes])[::-1]

        for ind, index_i in enumerate(sorted_indices):
            if boxes[index_i].score == 0 or boxes[index_i].classes[c] == 0:
                continue

            for j in range(ind + 1, len(sorted_indices)):
                index_j = sorted_indices[j]
                if boxes[index_j].score == 0:
                    continue

                # filter out redundant boxes (same class and overlapping too much)
                if (boxes[index_i].iou(boxes[index_j]) >= nms_threshold) and (
                        c == boxes[index_i].get_label()) and (
                        c == boxes[index_j].get_label()):
                    boxes[index_j].score = 0

    # remove the boxes which are less likely than obj_threshold
    boxes = [box for box in boxes if box.get_score() > obj_threshold]

    return boxes
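
# Illustrative usage sketch (not part of the original module): decoding a raw
# prediction tensor of shape (grid_h, grid_w, nb_box, 4 + 1 + nb_classes). The
# random values only exercise the function; in practice the tensor comes from a
# YOLO model output reshaped to that layout.
def _example_decode_output():
    nb_classes = 2
    anchors = [[0.57, 0.67], [1.87, 2.06]]
    raw = np.random.rand(7, 7, len(anchors), 4 + 1 + nb_classes).astype(np.float32)
    # Returns the BoundingBox objects that pass the score threshold and NMS
    return decode_output(raw, anchors, nb_classes, obj_threshold=0.5, nms_threshold=0.5)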