[TensorFlow] 使用 Mask R-CNN

mask_rcnn-2.0 网页链接：https://github.com/matterport/mask_rcnn/releases/tag/v2.0

mask_rcnn-master（matterport / mask_rcnn）网页链接：https://github.com/matterport/mask_rcnn

操作步骤

本文假设运行环境满足基本需求：python = 3.6.8, tensorflow-gpu = 1.12.0, keras = 2.0.8, matplotlib = 3.1.0……实验完成期间，需要安装许多依赖包，如：pycocotools, opencv-python, jupyter 等，请根据需求自行下载，如果程序运行期间出现相关错误，请查看依赖包版本是否正确。
下载 mask_rcnn-master 并解压，打开 anaconda prompt 进入运行环境，找到 mask_rcnn-master 项目下 demo.ipynb 所在的位置，并进入目录，输入“jupyter notebook”。

在 jupyter 中运行 demo.ipynb。第一遍运行时，程序会自动下载 mask_rcnn_coco.h5，所以时间会久一点，请耐心等待。运行结果如下：

从程序的运行结果中可以看出，输入一张图片，我们能够得到程序检测与分割的结果，但是无法获得每个 region 的特征（feature map）。
许多使用者在 issues 模块向项目作者提出了“想要获取 feature map”这一需求，例如：#1190，#1249，#1456。终于，kielnino 在 #1456 中给出了一个“改进后”的 model.py 文件。由于下载此文件需要翻越长城，我在此贴下代码，供大家使用。

"""

mask r-cnn

the main mask r-cnn model implementation.
Copyright (c) 2017 Matterport, Inc.

licensed under the mit license (see license for details)

written by waleed abdulla

"""
import os
import random
import datetime
import re
import math
import logging
# collections exports ``OrderedDict``; the lower-case name used by this file
# is kept as an alias so existing references continue to resolve.
from collections import OrderedDict as ordereddict
import multiprocessing

import numpy as np
import tensorflow as tf
import keras
import keras.backend as k
import keras.layers as kl
import keras.engine as ke
import keras.models as km

from mrcnn import utils
# Requires TensorFlow 1.3+ and Keras 2.0.8+.

# Fail fast at import time when the installed TensorFlow / Keras are older
# than the minimum versions this module was written against.
from distutils.version import LooseVersion

assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
assert LooseVersion(keras.__version__) >= LooseVersion('2.0.8')
############################################################

#  utility functions

############################################################
def log(text, array=None):
    """Print a text message, optionally followed by a summary of an array.

    text: message to print.
    array: optional NumPy array. When provided, the message is padded to
        25 columns and followed by the array's shape, its min and max
        values (left blank when the array is empty), and its dtype.
    """
    if array is not None:
        text = text.ljust(25)
        # Each piece is appended to the message (not assigned over it).
        text += ("shape: {:20}  ".format(str(array.shape)))
        if array.size:
            text += ("min: {:10.5f}  max: {:10.5f}".format(array.min(), array.max()))
        else:
            # min()/max() are undefined for an empty array; print blanks.
            text += ("min: {:10}  max: {:10}".format("", ""))
        text += "  {}".format(array.dtype)
    print(text)
class batchnorm(kl.BatchNormalization):
    """Extends the Keras BatchNormalization class to allow a central place
    to make changes if needed.

    Batch normalization has a negative effect on training if batches are small
    so this layer is often frozen (via setting in Config class) and functions
    as linear layer.
    """

    def call(self, inputs, training=None):
        """Forward to the Keras implementation with the given training flag.

        Note about training values:
            None: Train BN layers. This is the normal mode
            False: Freeze BN layers. Good when batch size is small
            True: (don't use). Set layer in training mode even when making inferences
        """
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        return super(batchnorm, self).call(inputs, training=training)
def compute_backbone_shapes(config, image_shape):
    """Computes the width and height of each stage of the backbone network.

    config: configuration object. Reads BACKBONE and BACKBONE_STRIDES, and
        COMPUTE_BACKBONE_SHAPE when BACKBONE is a callable.
    image_shape: sequence whose first two entries are (height, width).

    Returns:
        [N, (height, width)]. Where N is the number of stages
    """
    # A callable backbone is user-supplied, so the config must also say how
    # to compute its stage shapes.
    if callable(config.BACKBONE):
        return config.COMPUTE_BACKBONE_SHAPE(image_shape)

    # Currently supports ResNet only
    assert config.BACKBONE in ["resnet50", "resnet101"]
    # Each stage downsamples by its stride; round up for partial cells.
    return np.array(
        [[int(math.ceil(image_shape[0] / stride)),
          int(math.ceil(image_shape[1] / stride))]
         for stride in config.BACKBONE_STRIDES])
############################################################

#  resnet graph

############################################################
# code adopted from:

# https://github.com/fchollet/deep-learning-models/blob/master/resnet50.py
def identity_block(input_tensor, kernel_size, filters, stage, block,
                   use_bias=True, train_bn=True):
    """The identity_block is the block that has no conv layer at shortcut

    # Arguments
        input_tensor: input tensor
        kernel_size: default 3, the kernel size of middle conv layer at main path
        filters: list of integers, the nb_filters of 3 conv layer at main path
        stage: integer, current stage label, used for generating layer names
        block: 'a','b'..., current block label, used for generating layer names
        use_bias: Boolean. To use or not use a bias in conv layers.
        train_bn: Boolean. Train or freeze Batch Norm layers
    """
    nb_filter1, nb_filter2, nb_filter3 = filters
    # Layer names are built by concatenation, e.g. 'res2b_branch2a'.
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    # Main path: 1x1 conv -> kxk conv -> 1x1 conv, each followed by batch norm.
    x = kl.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a',
                  use_bias=use_bias)(input_tensor)
    x = batchnorm(name=bn_name_base + '2a')(x, training=train_bn)
    x = kl.Activation('relu')(x)

    x = kl.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
                  name=conv_name_base + '2b', use_bias=use_bias)(x)
    x = batchnorm(name=bn_name_base + '2b')(x, training=train_bn)
    x = kl.Activation('relu')(x)

    x = kl.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c',
                  use_bias=use_bias)(x)
    x = batchnorm(name=bn_name_base + '2c')(x, training=train_bn)

    # Shortcut: the unmodified input is added back to the main path.
    x = kl.Add()([x, input_tensor])
    x = kl.Activation('relu', name='res' + str(stage) + block + '_out')(x)
    return x
def conv_block(input_tensor, kernel_size, filters, stage, block,
               strides=(2, 2), use_bias=True, train_bn=True):
    """conv_block is the block that has a conv layer at shortcut

    # Arguments
        input_tensor: input tensor
        kernel_size: default 3, the kernel size of middle conv layer at main path
        filters: list of integers, the nb_filters of 3 conv layer at main path
        stage: integer, current stage label, used for generating layer names
        block: 'a','b'..., current block label, used for generating layer names
        strides: strides of the first main-path conv and of the shortcut conv
        use_bias: Boolean. To use or not use a bias in conv layers.
        train_bn: Boolean. Train or freeze Batch Norm layers

    Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
    And the shortcut should have subsample=(2,2) as well
    """
    nb_filter1, nb_filter2, nb_filter3 = filters
    # Layer names are built by concatenation, e.g. 'res3a_branch2a'.
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    # Main path: strided 1x1 conv -> kxk conv -> 1x1 conv.
    x = kl.Conv2D(nb_filter1, (1, 1), strides=strides,
                  name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
    x = batchnorm(name=bn_name_base + '2a')(x, training=train_bn)
    x = kl.Activation('relu')(x)

    x = kl.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
                  name=conv_name_base + '2b', use_bias=use_bias)(x)
    x = batchnorm(name=bn_name_base + '2b')(x, training=train_bn)
    x = kl.Activation('relu')(x)

    x = kl.Conv2D(nb_filter3, (1, 1), name=conv_name_base +
                  '2c', use_bias=use_bias)(x)
    x = batchnorm(name=bn_name_base + '2c')(x, training=train_bn)

    # Shortcut path: a strided 1x1 conv so its output matches the main path.
    # The '1' suffix ('res3a_branch1') keeps these layers distinct from the
    # '2a'/'2b'/'2c' main-path layers and matters when weights are loaded
    # by layer name.
    shortcut = kl.Conv2D(nb_filter3, (1, 1), strides=strides,
                         name=conv_name_base + '1', use_bias=use_bias)(input_tensor)
    shortcut = batchnorm(name=bn_name_base + '1')(shortcut, training=train_bn)

    x = kl.Add()([x, shortcut])
    x = kl.Activation('relu', name='res' + str(stage) + block + '_out')(x)
    return x
def resnet_graph(input_image, architecture, stage5=False, train_bn=True):
    """Build a ResNet graph.

    input_image: input tensor fed to the first layer.
    architecture: Can be resnet50 or resnet101
    stage5: Boolean. If False, stage5 of the network is not created
    train_bn: Boolean. Train or freeze Batch Norm layers

    Returns the list [c1, c2, c3, c4, c5] holding the output tensor of each
    stage; c5 is None when stage5 is False.
    """
    assert architecture in ["resnet50", "resnet101"]
    # Stage 1
    x = kl.ZeroPadding2D((3, 3))(input_image)
    x = kl.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
    x = batchnorm(name='bn_conv1')(x, training=train_bn)
    x = kl.Activation('relu')(x)
    c1 = x = kl.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
    # Stage 2
    x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), train_bn=train_bn)
    x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', train_bn=train_bn)
    c2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', train_bn=train_bn)
    # Stage 3
    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', train_bn=train_bn)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', train_bn=train_bn)
    x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', train_bn=train_bn)
    c3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', train_bn=train_bn)
    # Stage 4
    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', train_bn=train_bn)
    # The two architectures differ only in the number of stage-4 identity blocks.
    block_count = {"resnet50": 5, "resnet101": 22}[architecture]
    for i in range(block_count):
        # chr(98) == 'b', so the blocks are labelled 'b', 'c', 'd', ...
        x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_bn=train_bn)
    c4 = x
    # Stage 5
    if stage5:
        x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', train_bn=train_bn)
        x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', train_bn=train_bn)
        c5 = x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', train_bn=train_bn)
    else:
        c5 = None
    return [c1, c2, c3, c4, c5]
############################################################

#  proposal layer

############################################################
def apply_box_deltas_graph(boxes, deltas):
    """Applies the given deltas to the given boxes.

    boxes: [N, (y1, x1, y2, x2)] boxes to update
    deltas: [N, (dy, dx, log(dh), log(dw))] refinements to apply

    Returns the refined boxes stacked as [N, (y1, x1, y2, x2)].
    """
    # Convert to y, x, h, w
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    center_y = boxes[:, 0] + 0.5 * height
    center_x = boxes[:, 1] + 0.5 * width
    # Apply deltas: centers shift by a fraction of the box size, sizes scale
    # by the exponential of the log-space deltas.
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= tf.exp(deltas[:, 2])
    width *= tf.exp(deltas[:, 3])
    # Convert back to y1, x1, y2, x2
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    y2 = y1 + height
    x2 = x1 + width
    result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out")
    return result
def clip_boxes_graph(boxes, window):
    """Clamp every box coordinate so it lies inside the given window.

    boxes: [N, (y1, x1, y2, x2)]
    window: [4] in the form y1, x1, y2, x2

    Returns a tensor with the same layout as boxes and a static second
    dimension of 4.
    """
    # Window edges and per-box coordinate columns.
    win_top, win_left, win_bottom, win_right = tf.split(window, 4)
    top, left, bottom, right = tf.split(boxes, 4, axis=1)

    def _clamp(coord, low, high):
        # Cap at the upper edge first, then raise to the lower edge.
        return tf.maximum(tf.minimum(coord, high), low)

    top = _clamp(top, win_top, win_bottom)
    left = _clamp(left, win_left, win_right)
    bottom = _clamp(bottom, win_top, win_bottom)
    right = _clamp(right, win_left, win_right)

    clipped = tf.concat([top, left, bottom, right], axis=1, name="clipped_boxes")
    # Re-assert the static shape on the concatenated tensor.
    clipped.set_shape((clipped.shape[0], 4))
    return clipped
class proposallayer(ke.Layer):
    """Receives anchor scores and selects a subset to pass as proposals
    to the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding
    box refinement deltas to anchors.

    Inputs:
        rpn_probs: [batch, num_anchors, (bg prob, fg prob)]
        rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))]
        anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in normalized coordinates

    Returns:
        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
    """

    def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
        super(proposallayer, self).__init__(**kwargs)
        self.config = config
        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold

    def call(self, inputs):
        # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
        scores = inputs[0][:, :, 1]
        # Box deltas [batch, num_rois, 4]
        deltas = inputs[1]
        deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])
        # Anchors
        anchors = inputs[2]

        # Improve performance by trimming to top anchors by score
        # and doing the rest on the smaller subset.
        pre_nms_limit = tf.minimum(self.config.PRE_NMS_LIMIT, tf.shape(anchors)[1])
        ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
                         name="top_anchors").indices
        scores = utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y),
                                   self.config.IMAGES_PER_GPU)
        deltas = utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y),
                                   self.config.IMAGES_PER_GPU)
        pre_nms_anchors = utils.batch_slice([anchors, ix], lambda a, x: tf.gather(a, x),
                                            self.config.IMAGES_PER_GPU,
                                            names=["pre_nms_anchors"])

        # Apply deltas to anchors to get refined anchors.
        # [batch, N, (y1, x1, y2, x2)]
        boxes = utils.batch_slice([pre_nms_anchors, deltas],
                                  lambda x, y: apply_box_deltas_graph(x, y),
                                  self.config.IMAGES_PER_GPU,
                                  names=["refined_anchors"])

        # Clip to image boundaries. Since we're in normalized coordinates,
        # clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
        window = np.array([0, 0, 1, 1], dtype=np.float32)
        boxes = utils.batch_slice(boxes,
                                  lambda x: clip_boxes_graph(x, window),
                                  self.config.IMAGES_PER_GPU,
                                  names=["refined_anchors_clipped"])

        # Filter out small boxes
        # According to Xinlei Chen's paper, this reduces detection accuracy
        # for small objects, so we're skipping it.

        # Non-max suppression
        def nms(boxes, scores):
            indices = tf.image.non_max_suppression(
                boxes, scores, self.proposal_count,
                self.nms_threshold, name="rpn_non_max_suppression")
            proposals = tf.gather(boxes, indices)
            # Pad if needed so every image yields exactly proposal_count rows
            padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
            proposals = tf.pad(proposals, [(0, padding), (0, 0)])
            return proposals

        proposals = utils.batch_slice([boxes, scores], nms,
                                      self.config.IMAGES_PER_GPU)
        return proposals

    def compute_output_shape(self, input_shape):
        return (None, self.proposal_count, 4)
############################################################

#  roialign layer

############################################################
def log2_graph(x):
    """Return the base-2 logarithm of x.

    Computed as ln(x) / ln(2), because TF doesn't have a native log2.
    """
    natural_log = tf.log(x)
    log_of_two = tf.log(2.0)
    return natural_log / log_of_two
class pyramidroialign(ke.Layer):
    """Implements ROI Pooling on multiple levels of the feature pyramid.

    Params:
    - pool_shape: [pool_height, pool_width] of the output pooled regions. Usually [7, 7]

    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
             coordinates. Possibly padded with zeros if not enough
             boxes to fill the array.
    - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
    - feature_maps: List of feature maps from different levels of the pyramid.
                    Each is [batch, height, width, channels]

    Output:
    Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
    The width and height are those specific in the pool_shape in the layer
    constructor.
    """

    def __init__(self, pool_shape, **kwargs):
        super(pyramidroialign, self).__init__(**kwargs)
        self.pool_shape = tuple(pool_shape)

    def call(self, inputs):
        # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
        boxes = inputs[0]

        # Image meta
        # Holds details about the image. See compose_image_meta()
        image_meta = inputs[1]

        # Feature Maps. List of feature maps from different level of the
        # feature pyramid. Each is [batch, height, width, channels]
        feature_maps = inputs[2:]

        # Assign each ROI to a level in the pyramid based on the ROI area.
        y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
        h = y2 - y1
        w = x2 - x1
        # Use shape of first image. Images in a batch must have the same size.
        image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]
        # Equation 1 in the Feature Pyramid Networks paper. Account for
        # the fact that our coordinates are normalized here.
        # e.g. a 224x224 ROI (in pixels) maps to P4
        image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
        roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
        # Offset by 4 and clamp to the available levels 2..5.
        roi_level = tf.minimum(5, tf.maximum(
            2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
        roi_level = tf.squeeze(roi_level, 2)

        # Loop through levels and apply ROI pooling to each. P2 to P5.
        pooled = []
        box_to_level = []
        for i, level in enumerate(range(2, 6)):
            ix = tf.where(tf.equal(roi_level, level))
            level_boxes = tf.gather_nd(boxes, ix)

            # Box indices for crop_and_resize.
            box_indices = tf.cast(ix[:, 0], tf.int32)

            # Keep track of which box is mapped to which level
            box_to_level.append(ix)

            # Stop gradient propagation to ROI proposals
            level_boxes = tf.stop_gradient(level_boxes)
            box_indices = tf.stop_gradient(box_indices)

            # Crop and Resize
            # From Mask R-CNN paper: "We sample four regular locations, so
            # that we can evaluate either max or average pooling. In fact,
            # interpolating only a single value at each bin center (without
            # pooling) is nearly as effective."
            #
            # Here we use the simplified approach of a single value per bin,
            # which is how it's done in tf.crop_and_resize()
            # Result: [batch * num_boxes, pool_height, pool_width, channels]
            pooled.append(tf.image.crop_and_resize(
                feature_maps[i], level_boxes, box_indices, self.pool_shape,
                method="bilinear"))

        # Pack pooled features into one tensor
        pooled = tf.concat(pooled, axis=0)

        # Pack box_to_level mapping into one array and add another
        # column representing the order of pooled boxes
        box_to_level = tf.concat(box_to_level, axis=0)
        box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
        box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                                 axis=1)

        # Rearrange pooled features to match the order of the original boxes
        # Sort box_to_level by batch then box index
        # TF doesn't have a way to sort by two columns, so merge them and sort.
        # The merged key is only unambiguous while box indices stay below 100000.
        sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
        # top_k sorts descending; reverse to get ascending order.
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
            box_to_level)[0]).indices[::-1]
        ix = tf.gather(box_to_level[:, 2], ix)
        pooled = tf.gather(pooled, ix)

        # Re-add the batch dimension
        shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
        pooled = tf.reshape(pooled, shape)
        return pooled

    def compute_output_shape(self, input_shape):
        # (batch, num_boxes) + (pool_height, pool_width) + (channels,)
        return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )
############################################################

#  detection target layer

############################################################
def overlaps_graph(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.

    boxes1, boxes2: [N, (y1, x1, y2, x2)].

    Returns a tensor of shape [len(boxes1), len(boxes2)] with the IoU of
    every pair.
    """
    # 1. Tile boxes2 and repeat boxes1. This allows us to compare
    # every boxes1 against every boxes2 without loops.
    # TF doesn't have an equivalent to np.repeat() so simulate it
    # using tf.tile() and tf.reshape.
    b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
                            [1, 1, tf.shape(boxes2)[0]]), [-1, 4])
    b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
    # 2. Compute intersections
    b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
    b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
    y1 = tf.maximum(b1_y1, b2_y1)
    x1 = tf.maximum(b1_x1, b2_x1)
    y2 = tf.minimum(b1_y2, b2_y2)
    x2 = tf.minimum(b1_x2, b2_x2)
    # Clamp at zero so disjoint boxes contribute no intersection area.
    intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
    # 3. Compute unions
    b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
    union = b1_area + b2_area - intersection
    # 4. Compute IoU and reshape to [boxes1, boxes2]
    iou = intersection / union
    overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
    return overlaps
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    """Generates detection targets for one image. Subsamples proposals and
    generates target class IDs, bounding box deltas, and masks for each.

    Inputs:
    proposals: [POST_NMS_ROIS_TRAINING, (y1, x1, y2, x2)] in normalized coordinates. Might
               be zero padded if there are not enough proposals.
    gt_class_ids: [MAX_GT_INSTANCES] int class IDs
    gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
    gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.
    config: configuration object. Reads TRAIN_ROIS_PER_IMAGE,
            ROI_POSITIVE_RATIO, BBOX_STD_DEV, USE_MINI_MASK and MASK_SHAPE.

    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
    and masks.
    rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
    class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
    deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
    masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
           boundaries and resized to neural network output size.

    Note: Returned arrays might be zero padded if not enough target ROIs.
    """
    # Assertions
    asserts = [
        tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
                  name="roi_assertion"),
    ]
    # Tie the assertion to the proposals tensor so it runs with the graph.
    with tf.control_dependencies(asserts):
        proposals = tf.identity(proposals)

    # Remove zero padding
    proposals, _ = trim_zeros_graph(proposals, name="trim_proposals")
    gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
    gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros,
                                   name="trim_gt_class_ids")
    gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2,
                         name="trim_gt_masks")

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
    non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
    crowd_boxes = tf.gather(gt_boxes, crowd_ix)
    gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
    gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
    gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)

    # Compute overlaps matrix [proposals, gt_boxes]
    overlaps = overlaps_graph(proposals, gt_boxes)

    # Compute overlaps with crowd boxes [proposals, crowd_boxes]
    crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
    crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
    no_crowd_bool = (crowd_iou_max < 0.001)

    # Determine positive and negative ROIs
    roi_iou_max = tf.reduce_max(overlaps, axis=1)
    # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
    positive_roi_bool = (roi_iou_max >= 0.5)
    positive_indices = tf.where(positive_roi_bool)[:, 0]
    # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
    negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]

    # Subsample ROIs. Aim for 33% positive
    # Positive ROIs
    positive_count = int(config.TRAIN_ROIS_PER_IMAGE *
                         config.ROI_POSITIVE_RATIO)
    positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
    # From here on positive_count is the number actually selected (a tensor).
    positive_count = tf.shape(positive_indices)[0]
    # Negative ROIs. Add enough to maintain positive:negative ratio.
    r = 1.0 / config.ROI_POSITIVE_RATIO
    negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count
    negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
    # Gather selected ROIs
    positive_rois = tf.gather(proposals, positive_indices)
    negative_rois = tf.gather(proposals, negative_indices)

    # Assign positive ROIs to GT boxes.
    positive_overlaps = tf.gather(overlaps, positive_indices)
    # argmax over an empty GT axis is invalid, so fall back to an empty
    # int64 assignment when there are no GT boxes.
    roi_gt_box_assignment = tf.cond(
        tf.greater(tf.shape(positive_overlaps)[1], 0),
        true_fn=lambda: tf.argmax(positive_overlaps, axis=1),
        false_fn=lambda: tf.cast(tf.constant([]), tf.int64)
    )
    roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
    roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)

    # Compute bbox refinement for positive ROIs
    deltas = utils.box_refinement_graph(positive_rois, roi_gt_boxes)
    deltas /= config.BBOX_STD_DEV

    # Assign positive ROIs to GT masks
    # Permute masks to [N, height, width, 1]
    transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1)
    # Pick the right mask for each ROI
    roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)

    # Compute mask targets
    boxes = positive_rois
    if config.USE_MINI_MASK:
        # Transform ROI coordinates from normalized image space
        # to normalized mini-mask space.
        y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
        gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
        gt_h = gt_y2 - gt_y1
        gt_w = gt_x2 - gt_x1
        y1 = (y1 - gt_y1) / gt_h
        x1 = (x1 - gt_x1) / gt_w
        y2 = (y2 - gt_y1) / gt_h
        x2 = (x2 - gt_x1) / gt_w
        boxes = tf.concat([y1, x1, y2, x2], 1)
    box_ids = tf.range(0, tf.shape(roi_masks)[0])
    masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes,
                                     box_ids,
                                     config.MASK_SHAPE)
    # Remove the extra dimension from masks.
    masks = tf.squeeze(masks, axis=3)

    # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
    # binary cross entropy loss.
    masks = tf.round(masks)

    # Append negative ROIs and pad bbox deltas and masks that
    # are not used for negative ROIs with zeros.
    rois = tf.concat([positive_rois, negative_rois], axis=0)
    n = tf.shape(negative_rois)[0]
    p = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
    rois = tf.pad(rois, [(0, p), (0, 0)])
    # Positive-only tensors need n + p extra rows: n for the negative ROIs
    # plus p to reach TRAIN_ROIS_PER_IMAGE.
    roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, n + p), (0, 0)])
    roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, n + p)])
    deltas = tf.pad(deltas, [(0, n + p), (0, 0)])
    masks = tf.pad(masks, [[0, n + p], (0, 0), (0, 0)])

    return rois, roi_gt_class_ids, deltas, masks
class detectiontargetlayer(ke.Layer):
    """Subsamples proposals and generates target box refinement, class_ids,
    and masks for each.

    Inputs:
    proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
               be zero padded if there are not enough proposals.
    gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
    gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized
              coordinates.
    gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type

    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
    and masks.
    rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized
          coordinates
    target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
    target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw)]
    target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
                 Masks cropped to bbox boundaries and resized to neural
                 network output size.

    Note: Returned arrays might be zero padded if not enough target ROIs.
    """

    def __init__(self, config, **kwargs):
        super(detectiontargetlayer, self).__init__(**kwargs)
        self.config = config

    def call(self, inputs):
        proposals = inputs[0]
        gt_class_ids = inputs[1]
        gt_boxes = inputs[2]
        gt_masks = inputs[3]

        # Slice the batch and run a graph for each slice
        # TODO: Rename target_bbox to target_deltas for clarity
        names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
        outputs = utils.batch_slice(
            [proposals, gt_class_ids, gt_boxes, gt_masks],
            lambda w, x, y, z: detection_targets_graph(
                w, x, y, z, self.config),
            self.config.IMAGES_PER_GPU, names=names)
        return outputs

    def compute_output_shape(self, input_shape):
        return [
            (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # rois
            (None, self.config.TRAIN_ROIS_PER_IMAGE),  # class_ids
            (None, self.config.TRAIN_ROIS_PER_IMAGE, 4),  # deltas
            (None, self.config.TRAIN_ROIS_PER_IMAGE, self.config.MASK_SHAPE[0],
             self.config.MASK_SHAPE[1])  # masks
        ]

    def compute_mask(self, inputs, mask=None):
        # One mask entry per output; none of the four outputs is masked.
        return [None, None, None, None]
############################################################

#  detection layer

############################################################
def refine_detections_graph(rois, probs, deltas, window, feature_maps, config):
    """Refine classified proposals, filter overlaps and return the final
    detections, each followed by the feature vector of its roi.

    inputs:
        rois: [n, (y1, x1, y2, x2)] in normalized coordinates
        probs: [n, num_classes]. class probabilities.
        deltas: [n, num_classes, (dy, dx, log(dh), log(dw))]. class-specific
                bounding box deltas.
        window: (y1, x1, y2, x2) in normalized coordinates. the part of the image
            that contains the image excluding the padding.
        feature_maps: per-roi features. They are gathered with the same
            indices as the boxes and concatenated along axis 1 after the
            score column, so this must be a rank-2 tensor [n, feature_size].
        config: model config; reads bbox_std_dev, detection_min_confidence,
            detection_max_instances and detection_nms_threshold.

    returns detections shaped:
        [detection_max_instances,
         (y1, x1, y2, x2, class_id, score, feature_0, ..., feature_k)]
    where coordinates are normalized and unused rows are zero padded.
    """
    # class ids per roi
    class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
    # class probability of the top class of each roi
    indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
    class_scores = tf.gather_nd(probs, indices)
    # class-specific bounding box deltas
    deltas_specific = tf.gather_nd(deltas, indices)
    # apply bounding box deltas
    # shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
    refined_rois = apply_box_deltas_graph(
        rois, deltas_specific * config.bbox_std_dev)
    # clip boxes to image window
    refined_rois = clip_boxes_graph(refined_rois, window)

    # todo: filter out boxes with zero area

    # filter out background boxes
    keep = tf.where(class_ids > 0)[:, 0]
    # filter out low confidence boxes
    if config.detection_min_confidence:
        conf_keep = tf.where(class_scores >= config.detection_min_confidence)[:, 0]
        keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                        tf.expand_dims(conf_keep, 0))
        keep = tf.sparse_tensor_to_dense(keep)[0]

    # apply per-class nms
    # 1. prepare variables
    pre_nms_class_ids = tf.gather(class_ids, keep)
    pre_nms_scores = tf.gather(class_scores, keep)
    pre_nms_rois = tf.gather(refined_rois, keep)
    unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]

    def nms_keep_map(class_id):
        """apply non-maximum suppression on rois of the given class."""
        # indices of rois of the given class
        ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
        # apply nms
        class_keep = tf.image.non_max_suppression(
            tf.gather(pre_nms_rois, ixs),
            tf.gather(pre_nms_scores, ixs),
            max_output_size=config.detection_max_instances,
            iou_threshold=config.detection_nms_threshold)
        # map indices back to positions in the original roi list
        class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
        # pad with -1 so returned tensors have the same shape
        gap = config.detection_max_instances - tf.shape(class_keep)[0]
        class_keep = tf.pad(class_keep, [(0, gap)],
                            mode='constant', constant_values=-1)
        # set shape so map_fn() can infer result shape
        class_keep.set_shape([config.detection_max_instances])
        return class_keep

    # 2. map over class ids
    nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
                         dtype=tf.int64)
    # 3. merge results into one list, and remove -1 padding
    nms_keep = tf.reshape(nms_keep, [-1])
    nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
    # 4. compute intersection between keep and nms_keep
    keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                    tf.expand_dims(nms_keep, 0))
    keep = tf.sparse_tensor_to_dense(keep)[0]

    # keep top detections
    roi_count = config.detection_max_instances
    class_scores_keep = tf.gather(class_scores, keep)
    num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
    top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
    keep = tf.gather(keep, top_ids)

    # arrange output as [n, (y1, x1, y2, x2, class_id, score, features...)]
    # coordinates are normalized.
    detections = tf.concat([
        tf.gather(refined_rois, keep),
        tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
        tf.gather(class_scores, keep)[..., tf.newaxis],
        tf.gather(feature_maps, keep)
        ], axis=1)

    # pad with zeros if detections < detection_max_instances
    gap = config.detection_max_instances - tf.shape(detections)[0]
    detections = tf.pad(detections, [(0, gap), (0, 0)], "constant")
    return detections
class detectionlayer(ke.Layer):
    """takes classified proposal boxes, their bounding box deltas and their
    per-roi feature vectors and returns the final detection boxes.

    returns:
    [batch, num_detections,
     (y1, x1, y2, x2, class_id, class_score, features...)] where
    coordinates are normalized and the trailing feature part is
    config.fpn_classif_fc_layers_size wide.
    """

    def __init__(self, config=None, **kwargs):
        super(detectionlayer, self).__init__(**kwargs)
        self.config = config

    def call(self, inputs):
        """inputs: [rois, mrcnn_class, mrcnn_bbox, image_meta, feature_maps]."""
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]
        feature_maps = inputs[4]

        # get windows of images in normalized coordinates. windows are the area
        # in the image that excludes the padding.
        # use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = norm_boxes_graph(m['window'], image_shape[:2])

        # run detection refinement graph on each item in the batch
        detections_batch = utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window, feature_maps],
            lambda x, y, w, z, f: refine_detections_graph(x, y, w, z, f, self.config),
            self.config.images_per_gpu)

        # reshape output
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score,
        # features...)] in normalized coordinates. each row is the 6 detection
        # values followed by the roi feature vector.
        return tf.reshape(
            detections_batch,
            [self.config.batch_size, self.config.detection_max_instances,
             6 + self.config.fpn_classif_fc_layers_size])

    def compute_output_shape(self, input_shape):
        """static output shape; the batch dimension is left as None."""
        return (None, self.config.detection_max_instances,
                6 + self.config.fpn_classif_fc_layers_size)
############################################################

#  region proposal network (rpn)

############################################################
def rpn_graph(feature_map, anchors_per_location, anchor_stride):
    """builds the computation graph of region proposal network.

    feature_map: backbone features [batch, height, width, depth]
    anchors_per_location: number of anchors per pixel in the feature map
    anchor_stride: controls the density of anchors. typically 1 (anchors for
                   every pixel in the feature map), or 2 (every other pixel).

    returns:
        rpn_class_logits: [batch, h * w * anchors_per_location, 2] anchor classifier logits (before softmax)
        rpn_probs: [batch, h * w * anchors_per_location, 2] anchor classifier probabilities.
        rpn_bbox: [batch, h * w * anchors_per_location, (dy, dx, log(dh), log(dw))] deltas to be
                  applied to anchors.
    """
    # todo: check if stride of 2 causes alignment issues if the feature map
    # is not even.
    # shared convolutional base of the rpn
    shared = kl.Conv2D(512, (3, 3), padding='same', activation='relu',
                       strides=anchor_stride,
                       name='rpn_conv_shared')(feature_map)

    # anchor score. [batch, height, width, anchors per location * 2].
    x = kl.Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
                  activation='linear', name='rpn_class_raw')(shared)

    # reshape to [batch, anchors, 2]
    rpn_class_logits = kl.Lambda(
        lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x)

    # softmax on last dimension of bg/fg.
    rpn_probs = kl.Activation(
        "softmax", name="rpn_class_xxx")(rpn_class_logits)

    # bounding box refinement. [batch, h, w, anchors per location * depth]
    # where depth is [x, y, log(w), log(h)]
    x = kl.Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
                  activation='linear', name='rpn_bbox_pred')(shared)

    # reshape to [batch, anchors, 4]
    rpn_bbox = kl.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)

    return [rpn_class_logits, rpn_probs, rpn_bbox]
def build_rpn_model(anchor_stride, anchors_per_location, depth):
    """builds a keras model of the region proposal network.
    it wraps the rpn graph so it can be used multiple times with shared
    weights.

    anchors_per_location: number of anchors per pixel in the feature map
    anchor_stride: controls the density of anchors. typically 1 (anchors for
                   every pixel in the feature map), or 2 (every other pixel).
    depth: depth of the backbone feature map.

    returns a keras model object. the model outputs, when called, are:
    rpn_class_logits: [batch, h * w * anchors_per_location, 2] anchor classifier logits (before softmax)
    rpn_probs: [batch, h * w * anchors_per_location, 2] anchor classifier probabilities.
    rpn_bbox: [batch, h * w * anchors_per_location, (dy, dx, log(dh), log(dw))] deltas to be
                applied to anchors.
    """
    # height and width are left unspecified (None) in the input shape; only
    # the channel depth is fixed.
    input_feature_map = kl.Input(shape=[None, None, depth],
                                 name="input_rpn_feature_map")
    outputs = rpn_graph(input_feature_map, anchors_per_location, anchor_stride)
    return km.Model([input_feature_map], outputs, name="rpn_model")
############################################################

#  feature pyramid network heads

############################################################
def fpn_classifier_graph(rois, feature_maps, image_meta,
                         pool_size, num_classes, train_bn=True,
                         fc_layers_size=1024):
    """builds the computation graph of the feature pyramid network classifier
    and regressor heads.

    rois: [batch, num_rois, (y1, x1, y2, x2)] proposal boxes in normalized
          coordinates.
    feature_maps: list of feature maps from different layers of the pyramid,
                  [p2, p3, p4, p5]. each has a different resolution.
    image_meta: [batch, (meta data)] image details. see compose_image_meta()
    pool_size: the width of the square feature map generated from roi pooling.
    num_classes: number of classes, which determines the depth of the results
    train_bn: boolean. train or freeze batch norm layers
    fc_layers_size: size of the 2 fc layers

    returns:
        logits: [batch, num_rois, num_classes] classifier logits (before softmax)
        probs: [batch, num_rois, num_classes] classifier probabilities
        bbox_deltas: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))] deltas to apply to
                     proposal boxes
        shared: the output of the second fc layer after its two singleton
                spatial axes are squeezed away; this is the per-roi feature
                vector both heads are computed from.
    """
    # roi pooling
    # shape: [batch, num_rois, pool_size, pool_size, channels]
    x = pyramidroialign([pool_size, pool_size],
                        name="roi_align_classifier")([rois, image_meta] + feature_maps)
    # two fc layers of size fc_layers_size (implemented with conv2d for
    # consistency)
    x = kl.TimeDistributed(kl.Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"),
                           name="mrcnn_class_conv1")(x)
    x = kl.TimeDistributed(batchnorm(), name='mrcnn_class_bn1')(x, training=train_bn)
    x = kl.Activation('relu')(x)
    x = kl.TimeDistributed(kl.Conv2D(fc_layers_size, (1, 1)),
                           name="mrcnn_class_conv2")(x)
    x = kl.TimeDistributed(batchnorm(), name='mrcnn_class_bn2')(x, training=train_bn)
    x = kl.Activation('relu')(x)

    # drop axes 3 and 2 (squeezed in that order so the indices stay valid)
    shared = kl.Lambda(lambda x: k.squeeze(k.squeeze(x, 3), 2),
                       name="pool_squeeze")(x)

    # classifier head
    mrcnn_class_logits = kl.TimeDistributed(kl.Dense(num_classes),
                                            name='mrcnn_class_logits')(shared)
    mrcnn_probs = kl.TimeDistributed(kl.Activation("softmax"),
                                     name="mrcnn_class")(mrcnn_class_logits)

    # bbox head
    # [batch, num_rois, num_classes * (dy, dx, log(dh), log(dw))]
    x = kl.TimeDistributed(kl.Dense(num_classes * 4, activation='linear'),
                           name='mrcnn_bbox_fc')(shared)
    # reshape to [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
    s = k.int_shape(x)
    mrcnn_bbox = kl.Reshape((s[1], num_classes, 4), name="mrcnn_bbox")(x)

    return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox, shared
def build_fpn_mask_graph(rois, feature_maps, image_meta,
                         pool_size, num_classes, train_bn=True):
    """builds the computation graph of the mask head of feature pyramid network.

    rois: [batch, num_rois, (y1, x1, y2, x2)] proposal boxes in normalized
          coordinates.
    feature_maps: list of feature maps from different layers of the pyramid,
                  [p2, p3, p4, p5]. each has a different resolution.
    image_meta: [batch, (meta data)] image details. see compose_image_meta()
    pool_size: the width of the square feature map generated from roi pooling.
    num_classes: number of classes, which determines the depth of the results
    train_bn: boolean. train or freeze batch norm layers

    returns: masks [batch, num_rois, mask_pool_size, mask_pool_size, num_classes]
    """
    # roi pooling
    # shape: [batch, num_rois, mask_pool_size, mask_pool_size, channels]
    x = pyramidroialign([pool_size, pool_size],
                        name="roi_align_mask")([rois, image_meta] + feature_maps)

    # four identical conv -> batch norm -> relu stages. the layer names are
    # mrcnn_mask_conv1..4 and mrcnn_mask_bn1..4.
    for stage in range(1, 5):
        x = kl.TimeDistributed(kl.Conv2D(256, (3, 3), padding="same"),
                               name="mrcnn_mask_conv{}".format(stage))(x)
        x = kl.TimeDistributed(batchnorm(),
                               name="mrcnn_mask_bn{}".format(stage))(x, training=train_bn)
        x = kl.Activation('relu')(x)

    # stride-2 transposed conv, then a per-class sigmoid mask
    x = kl.TimeDistributed(kl.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"),
                           name="mrcnn_mask_deconv")(x)
    x = kl.TimeDistributed(kl.Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"),
                           name="mrcnn_mask")(x)
    return x
############################################################

#  loss functions

############################################################
def smooth_l1_loss(y_true, y_pred):
    """implements smooth-l1 loss.

    y_true and y_pred are typically: [n, 4], but could be any shape.

    returns the element-wise loss (same shape as the inputs):
    0.5 * diff**2 where |diff| < 1, and |diff| - 0.5 elsewhere.
    """
    diff = k.abs(y_true - y_pred)
    # 1.0 where the quadratic branch applies, 0.0 where the linear one does
    less_than_one = k.cast(k.less(diff, 1.0), "float32")
    loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
    return loss
def rpn_class_loss_graph(rpn_match, rpn_class_logits):
    """rpn anchor classifier loss.

    rpn_match: [batch, anchors, 1]. anchor match type. 1=positive,
               -1=negative, 0=neutral anchor.
    rpn_class_logits: [batch, anchors, 2]. rpn classifier logits for bg/fg.

    returns the mean cross entropy over the non-neutral anchors, or 0.0 when
    there are none.
    """
    # squeeze last dim to simplify
    rpn_match = tf.squeeze(rpn_match, -1)
    # get anchor classes. convert the -1/+1 match to 0/1 values.
    anchor_class = k.cast(k.equal(rpn_match, 1), tf.int32)
    # positive and negative anchors contribute to the loss,
    # but neutral anchors (match value = 0) don't.
    indices = tf.where(k.not_equal(rpn_match, 0))
    # pick rows that contribute to the loss and filter out the rest.
    rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
    anchor_class = tf.gather_nd(anchor_class, indices)
    # cross entropy loss
    loss = k.sparse_categorical_crossentropy(target=anchor_class,
                                             output=rpn_class_logits,
                                             from_logits=True)
    loss = k.switch(tf.size(loss) > 0, k.mean(loss), tf.constant(0.0))
    return loss
def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox):
    """build the rpn bounding box regression loss.

    config: the model config object (images_per_gpu is read from it).
    target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
        zero padded where there are fewer positive anchors.
    rpn_match: [batch, anchors, 1]. anchor match type. 1=positive,
               -1=negative, 0=neutral anchor.
    rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]

    returns the mean smooth-l1 loss over positive anchors, or 0.0 when the
    loss tensor is empty.
    """
    # only anchors with match value 1 take part; -1 and 0 are ignored.
    match = k.squeeze(rpn_match, -1)
    positive_ix = tf.where(k.equal(match, 1))

    # predicted deltas of the positive anchors
    predicted = tf.gather_nd(rpn_bbox, positive_ix)

    # number of positive anchors per image, used to cut the zero padded
    # targets down to the same length as the predictions
    positives_per_image = k.sum(k.cast(k.equal(match, 1), tf.int32), axis=1)
    targets = batch_pack_graph(target_bbox, positives_per_image,
                               config.images_per_gpu)

    per_element = smooth_l1_loss(targets, predicted)
    return k.switch(tf.size(per_element) > 0,
                    k.mean(per_element),
                    tf.constant(0.0))
def mrcnn_class_loss_graph(target_class_ids, pred_class_logits,
                           active_class_ids):
    """classification loss of the mask rcnn classifier head.

    target_class_ids: [batch, num_rois]. integer class ids, zero padded.
    pred_class_logits: [batch, num_rois, num_classes]
    active_class_ids: [batch, num_classes]. 1 for classes that are in the
        dataset of the image, 0 for classes that are not.

    returns the summed cross entropy of the rois whose predicted class is
    active, divided by the number of such rois.
    """
    # keras may hand the targets over as float32 while building the model,
    # so force an integer dtype first.
    labels = tf.cast(target_class_ids, 'int64')

    # look up, for every roi, whether its predicted class is active.
    # todo: update this line to work with batch > 1. right now it assumes all
    #       images in a batch have the same active_class_ids
    predicted_ids = tf.argmax(pred_class_logits, axis=2)
    active_mask = tf.gather(active_class_ids[0], predicted_ids)

    per_roi = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=pred_class_logits)

    # zero the loss of rois predicted as an inactive class, then average
    # over the rois that still contribute.
    masked = per_roi * active_mask
    return tf.reduce_sum(masked) / tf.reduce_sum(active_mask)
def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox):
    """bounding box refinement loss of mask r-cnn.

    target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
    target_class_ids: [batch, num_rois]. integer class ids.
    pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]

    returns the mean smooth-l1 loss over positive rois (0.0 if none).
    """
    # flatten batch and roi axes into one
    flat_class_ids = k.reshape(target_class_ids, (-1,))
    flat_targets = k.reshape(target_bbox, (-1, 4))
    flat_preds = k.reshape(pred_bbox, (-1, k.int_shape(pred_bbox)[2], 4))

    # rois with class id > 0 are the positives; pair each one with its
    # own class id to address the matching predicted delta.
    roi_ix = tf.where(flat_class_ids > 0)[:, 0]
    roi_class_ids = tf.cast(tf.gather(flat_class_ids, roi_ix), tf.int64)
    selector = tf.stack([roi_ix, roi_class_ids], axis=1)

    # true and predicted deltas that enter the loss
    true_deltas = tf.gather(flat_targets, roi_ix)
    pred_deltas = tf.gather_nd(flat_preds, selector)

    per_element = k.switch(
        tf.size(true_deltas) > 0,
        smooth_l1_loss(y_true=true_deltas, y_pred=pred_deltas),
        tf.constant(0.0))
    return k.mean(per_element)
def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks):
    """binary cross-entropy loss of the mask head.

    target_masks: [batch, num_rois, height, width].
        a float32 tensor of values 0 or 1, zero padded.
    target_class_ids: [batch, num_rois]. integer class ids. zero padded.
    pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
                with values from 0 to 1.

    returns the mean binary cross entropy over positive rois (0.0 if none).
    """
    # collapse the batch and roi axes into a single leading axis
    flat_class_ids = k.reshape(target_class_ids, (-1,))
    true_dims = tf.shape(target_masks)
    flat_targets = k.reshape(target_masks, (-1, true_dims[2], true_dims[3]))
    pred_dims = tf.shape(pred_masks)
    flat_preds = k.reshape(
        pred_masks, (-1, pred_dims[2], pred_dims[3], pred_dims[4]))
    # move the class axis forward: [n, num_classes, height, width]
    class_first_preds = tf.transpose(flat_preds, [0, 3, 1, 2])

    # positives are rois with class id > 0; for each one only the mask of
    # its own class is compared with the target.
    roi_ix = tf.where(flat_class_ids > 0)[:, 0]
    roi_class_ids = tf.cast(tf.gather(flat_class_ids, roi_ix), tf.int64)
    selector = tf.stack([roi_ix, roi_class_ids], axis=1)

    y_true = tf.gather(flat_targets, roi_ix)
    y_pred = tf.gather_nd(class_first_preds, selector)

    # fall back to a constant 0 when there is no positive roi
    bce = k.switch(tf.size(y_true) > 0,
                   k.binary_crossentropy(target=y_true, output=y_pred),
                   tf.constant(0.0))
    return k.mean(bce)
############################################################

#  data generator

############################################################
def load_image_gt(dataset, config, image_id, augment=False, augmentation=None,
                  use_mini_mask=False):
    """load and return ground truth data for an image (image, mask, bounding boxes).

    augment: (deprecated. use augmentation instead). if true, apply random
        image augmentation. currently, only horizontal flipping is offered.
    augmentation: optional. an imgaug (https://github.com/aleju/imgaug) augmentation.
        for example, passing imgaug.augmenters.Fliplr(0.5) flips images
        right/left 50% of the time.
    use_mini_mask: if false, returns full-size masks that are the same height
        and width as the original image. these can be big, for example
        1024x1024x100 (for 100 instances). mini masks are smaller, typically,
        224x224 and are generated by extracting the bounding box of the
        object and resizing it to mini_mask_shape.

    returns, in this order:
    image: [height, width, 3]
    image_meta: the result of compose_image_meta() for this image; it is
        built from the image id, the original shape of the image before
        resizing, the resized shape, the window, the scale and the active
        class ids.
    class_ids: [instance_count] integer class ids
    bbox: [instance_count, (y1, x1, y2, x2)]
    mask: [height, width, instance_count]. the height and width are those
        of the image unless use_mini_mask is true, in which case they are
        defined in mini_mask_shape.
    """
    # load image and mask
    image = dataset.load_image(image_id)
    mask, class_ids = dataset.load_mask(image_id)
    original_shape = image.shape
    image, window, scale, padding, crop = utils.resize_image(
        image,
        min_dim=config.image_min_dim,
        min_scale=config.image_min_scale,
        max_dim=config.image_max_dim,
        mode=config.image_resize_mode)
    mask = utils.resize_mask(mask, scale, padding, crop)

    # random horizontal flips.
    # todo: will be removed in a future update in favor of augmentation
    if augment:
        logging.warning("'augment' is deprecated. use 'augmentation' instead.")
        if random.randint(0, 1):
            image = np.fliplr(image)
            mask = np.fliplr(mask)

    # augmentation
    # this requires the imgaug lib (https://github.com/aleju/imgaug)
    if augmentation:
        import imgaug

        # augmenters that are safe to apply to masks
        # some, such as affine, have settings that make them unsafe, so always
        # test your augmentation on masks.
        # these are compared with augmenter.__class__.__name__, so they must
        # be spelled exactly like the imgaug class names.
        mask_augmenters = ["Sequential", "SomeOf", "OneOf", "Sometimes",
                           "Fliplr", "Flipud", "CropAndPad",
                           "Affine", "PiecewiseAffine"]

        def hook(images, augmenter, parents, default):
            """determines which augmenters to apply to masks."""
            return augmenter.__class__.__name__ in mask_augmenters

        # store shapes before augmentation to compare
        image_shape = image.shape
        mask_shape = mask.shape
        # make augmenters deterministic to apply similarly to images and masks
        det = augmentation.to_deterministic()
        image = det.augment_image(image)
        # change mask to np.uint8 because imgaug doesn't support bool arrays
        mask = det.augment_image(mask.astype(np.uint8),
                                 hooks=imgaug.HooksImages(activator=hook))
        # verify that shapes didn't change
        assert image.shape == image_shape, "augmentation shouldn't change image size"
        assert mask.shape == mask_shape, "augmentation shouldn't change mask size"
        # change mask back to bool (the builtin is used because the np.bool
        # alias no longer exists in current NumPy releases)
        mask = mask.astype(bool)

    # note that some masks might be all zeros if the corresponding instance
    # got cropped out. filter those instances out here.
    _idx = np.sum(mask, axis=(0, 1)) > 0
    mask = mask[:, :, _idx]
    class_ids = class_ids[_idx]
    # bounding boxes, computed from the remaining masks.
    # bbox: [num_instances, (y1, x1, y2, x2)]
    bbox = utils.extract_bboxes(mask)

    # active classes
    # different datasets have different classes, so track the
    # classes supported in the dataset of this image.
    active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32)
    source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]]
    active_class_ids[source_class_ids] = 1

    # resize masks to smaller size to reduce memory usage
    if use_mini_mask:
        mask = utils.minimize_mask(bbox, mask, config.mini_mask_shape)

    # image meta data
    image_meta = compose_image_meta(image_id, original_shape, image.shape,
                                    window, scale, active_class_ids)

    return image, image_meta, class_ids, bbox, mask
def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config):
    """generate targets for training stage 2 classifier and mask heads.
    this is not used in normal training. it's useful for debugging or to train
    the mask rcnn heads without using the rpn head.

    inputs:
    rpn_rois: [n, (y1, x1, y2, x2)] proposal boxes.
    gt_class_ids: [instance count] integer class ids (int32)
    gt_boxes: [instance count, (y1, x1, y2, x2)] (int32)
    gt_masks: [height, width, instance count] ground truth masks (bool). can
              be full size or mini-masks.

    returns:
    rois: [train_rois_per_image, (y1, x1, y2, x2)]
    class_ids: [train_rois_per_image]. integer class ids.
    bboxes: [train_rois_per_image, num_classes, (y, x, log(h), log(w))]. class-specific
            bbox refinements.
    masks: [train_rois_per_image, height, width, num_classes]. class specific masks cropped
           to bbox boundaries and resized to neural network output size.

    raises AssertionError when the inputs have the wrong dtype, when there
    are no rois or no instances, or when not enough rois can be sampled.
    """
    assert rpn_rois.shape[0] > 0
    assert gt_class_ids.dtype == np.int32, "expected int but got {}".format(
        gt_class_ids.dtype)
    assert gt_boxes.dtype == np.int32, "expected int but got {}".format(
        gt_boxes.dtype)
    assert gt_masks.dtype == np.bool_, "expected bool but got {}".format(
        gt_masks.dtype)

    # it's common to add gt boxes to rois but we don't do that here because
    # according to xinlei chen's paper, it doesn't help.

    # trim empty padding in gt_boxes and gt_masks parts
    instance_ids = np.where(gt_class_ids > 0)[0]
    assert instance_ids.shape[0] > 0, "image must contain instances."
    gt_class_ids = gt_class_ids[instance_ids]
    gt_boxes = gt_boxes[instance_ids]
    gt_masks = gt_masks[:, :, instance_ids]

    # compute areas of rois and ground truth boxes.
    rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * \
        (rpn_rois[:, 3] - rpn_rois[:, 1])
    gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * \
        (gt_boxes[:, 3] - gt_boxes[:, 1])

    # compute overlaps [rpn_rois, gt_boxes]
    overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
    for i in range(overlaps.shape[1]):
        gt = gt_boxes[i]
        overlaps[:, i] = utils.compute_iou(
            gt, rpn_rois, gt_box_area[i], rpn_roi_area)

    # assign rois to gt boxes
    rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
    rpn_roi_iou_max = overlaps[np.arange(
        overlaps.shape[0]), rpn_roi_iou_argmax]
    # gt box assigned to each roi
    rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
    rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]

    # positive rois are those with > 0.5 iou with a gt box.
    fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]

    # negative rois are those with max iou < 0.5. note that an roi whose max
    # iou is exactly 0.5 lands in neither group.
    # todo: to hard example mine or not to hard example mine, that's the question
    # bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
    bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]

    # subsample rois. the foreground share is config.roi_positive_ratio.
    # fg
    fg_roi_count = int(config.train_rois_per_image * config.roi_positive_ratio)
    if fg_ids.shape[0] > fg_roi_count:
        keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
    else:
        keep_fg_ids = fg_ids
    # bg
    remaining = config.train_rois_per_image - keep_fg_ids.shape[0]
    if bg_ids.shape[0] > remaining:
        keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
    else:
        keep_bg_ids = bg_ids
    # combine indices of rois to keep
    keep = np.concatenate([keep_fg_ids, keep_bg_ids])
    # need more?
    remaining = config.train_rois_per_image - keep.shape[0]
    if remaining > 0:
        # looks like we don't have enough samples to maintain the desired
        # balance. reduce requirements and fill in the rest. this is
        # likely different from the mask rcnn paper.

        # there is a small chance we have neither fg nor bg samples.
        if keep.shape[0] == 0:
            # pick bg regions again (same < 0.5 threshold as above)
            bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
            assert bg_ids.shape[0] >= remaining
            keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
            assert keep_bg_ids.shape[0] == remaining
            keep = np.concatenate([keep, keep_bg_ids])
        else:
            # fill the rest with repeated bg rois.
            keep_extra_ids = np.random.choice(
                keep_bg_ids, remaining, replace=True)
            keep = np.concatenate([keep, keep_extra_ids])
    assert keep.shape[0] == config.train_rois_per_image, \
        "keep doesn't match roi batch size {}, {}".format(
            keep.shape[0], config.train_rois_per_image)

    # reset the gt boxes assigned to bg rois.
    rpn_roi_gt_boxes[keep_bg_ids, :] = 0
    rpn_roi_gt_class_ids[keep_bg_ids] = 0

    # for each kept roi, assign a class_id, and for fg rois also add bbox refinement.
    rois = rpn_rois[keep]
    roi_gt_boxes = rpn_roi_gt_boxes[keep]
    roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
    roi_gt_assignment = rpn_roi_iou_argmax[keep]

    # class-aware bbox deltas. [y, x, log(h), log(w)]
    bboxes = np.zeros((config.train_rois_per_image,
                       config.num_classes, 4), dtype=np.float32)
    pos_ids = np.where(roi_gt_class_ids > 0)[0]
    bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = utils.box_refinement(
        rois[pos_ids], roi_gt_boxes[pos_ids, :4])
    # normalize bbox refinements
    bboxes /= config.bbox_std_dev

    # generate class-specific target masks
    masks = np.zeros((config.train_rois_per_image, config.mask_shape[0],
                      config.mask_shape[1], config.num_classes),
                     dtype=np.float32)
    for i in pos_ids:
        class_id = roi_gt_class_ids[i]
        assert class_id > 0, "class id must be greater than 0"
        gt_id = roi_gt_assignment[i]
        class_mask = gt_masks[:, :, gt_id]

        if config.use_mini_mask:
            # create a mask placeholder, the size of the image
            placeholder = np.zeros(config.image_shape[:2], dtype=bool)
            # gt box
            gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
            gt_w = gt_x2 - gt_x1
            gt_h = gt_y2 - gt_y1
            # resize mini mask to size of gt box
            placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
                np.round(utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
            # place the mini mask in the placeholder
            class_mask = placeholder

        # pick part of the mask and resize it
        y1, x1, y2, x2 = rois[i].astype(np.int32)
        m = class_mask[y1:y2, x1:x2]
        mask = utils.resize(m, config.mask_shape)
        masks[i, :, :, class_id] = mask

    return rois, roi_gt_class_ids, bboxes, masks
def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
    """Compute RPN training targets (anchor labels and box deltas) for one image.

    Given the anchors and GT boxes, compute overlaps, identify positive and
    negative anchors, and compute the deltas that refine each positive anchor
    to its matched GT box.

    image_shape: accepted for interface compatibility; not used here.
    anchors: [num_anchors, (y1, x1, y2, x2)]
    gt_class_ids: [num_gt_boxes] integer class ids. Negative ids mark crowd
        boxes, which are excluded from training.
    gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
    config: object providing rpn_train_anchors_per_image and rpn_bbox_std_dev.

    Returns:
    rpn_match: [num_anchors] (int32) matches between anchors and GT boxes.
               1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_bbox: [rpn_train_anchors_per_image, (dy, dx, log(dh), log(dw))]
              normalized deltas for the positive anchors, zero padded.

    NOTE(review): config attribute names are kept in the lower-case spelling
    used throughout this file; confirm they match the config class.
    """
    # RPN match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
    # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
    rpn_bbox = np.zeros((config.rpn_train_anchors_per_image, 4))

    # Handle crowds: a crowd box surrounds several instances and carries a
    # negative class id. Exclude crowds from the GT set, and remember which
    # anchors touch one so they are never labelled as negatives.
    crowd_ix = np.where(gt_class_ids < 0)[0]
    if crowd_ix.shape[0] > 0:
        non_crowd_ix = np.where(gt_class_ids > 0)[0]
        crowd_boxes = gt_boxes[crowd_ix]
        gt_class_ids = gt_class_ids[non_crowd_ix]
        gt_boxes = gt_boxes[non_crowd_ix]
        # Overlaps with crowd boxes: [anchors, crowds]
        crowd_overlaps = utils.compute_overlaps(anchors, crowd_boxes)
        crowd_iou_max = np.amax(crowd_overlaps, axis=1)
        no_crowd_bool = (crowd_iou_max < 0.001)
    else:
        # No crowds, so no anchor intersects one.
        no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)

    # Overlaps: [num_anchors, num_gt_boxes]
    overlaps = utils.compute_overlaps(anchors, gt_boxes)

    # Match anchors to GT boxes:
    #   IoU >= 0.7 with some GT box -> positive
    #   IoU <  0.3 with every GT box -> negative
    #   otherwise neutral (does not influence the loss).
    # No GT box is left unmatched: each one is matched to its closest
    # anchor(s) even if that IoU is below 0.3.
    #
    # 1. Set negative anchors first; they get overwritten below if a GT box
    #    is matched to them. Skip anchors in crowd areas.
    anchor_iou_argmax = np.argmax(overlaps, axis=1)
    anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
    rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
    # 2. Set an anchor for each GT box (regardless of IoU value). If several
    #    anchors tie for the best IoU, all of them are matched.
    gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:, 0]
    rpn_match[gt_iou_argmax] = 1
    # 3. Set anchors with high overlap as positive.
    rpn_match[anchor_iou_max >= 0.7] = 1

    # Subsample to balance positives and negatives.
    # Positives may fill at most half of the anchor budget.
    ids = np.where(rpn_match == 1)[0]
    extra = len(ids) - (config.rpn_train_anchors_per_image // 2)
    if extra > 0:
        # Reset the extra ones to neutral.
        ids = np.random.choice(ids, extra, replace=False)
        rpn_match[ids] = 0
    # Negatives fill whatever budget the positives left.
    ids = np.where(rpn_match == -1)[0]
    extra = len(ids) - (config.rpn_train_anchors_per_image -
                        np.sum(rpn_match == 1))
    if extra > 0:
        # Reset the extra ones to neutral.
        ids = np.random.choice(ids, extra, replace=False)
        rpn_match[ids] = 0

    # For positive anchors, compute the shift and scale needed to transform
    # them into the corresponding GT boxes. `ix` indexes into rpn_bbox.
    # TODO: use box_refinement() rather than duplicating the code here
    ids = np.where(rpn_match == 1)[0]
    for ix, (i, a) in enumerate(zip(ids, anchors[ids])):
        # Closest GT box (it might have IoU < 0.7)
        gt = gt_boxes[anchor_iou_argmax[i]]

        # Convert coordinates to center plus width/height.
        # GT box
        gt_h = gt[2] - gt[0]
        gt_w = gt[3] - gt[1]
        gt_center_y = gt[0] + 0.5 * gt_h
        gt_center_x = gt[1] + 0.5 * gt_w
        # Anchor
        a_h = a[2] - a[0]
        a_w = a[3] - a[1]
        a_center_y = a[0] + 0.5 * a_h
        a_center_x = a[1] + 0.5 * a_w

        # The bbox refinement that the RPN should predict.
        rpn_bbox[ix] = [
            (gt_center_y - a_center_y) / a_h,
            (gt_center_x - a_center_x) / a_w,
            np.log(gt_h / a_h),
            np.log(gt_w / a_w),
        ]
        # Normalize
        rpn_bbox[ix] /= config.rpn_bbox_std_dev

    return rpn_match, rpn_bbox
def _sample_random_boxes(y_low, y_high, x_low, x_high, n):
    """Draw n integer boxes [y1, x1, y2, x2] with non-zero height and width.

    Coordinates are sampled uniformly from [y_low, y_high) and
    [x_low, x_high). To avoid zero-area boxes, twice as many candidates as
    needed are drawn and degenerate ones are filtered out; if fewer than n
    survive, sampling is repeated.
    """
    min_side = 1
    while True:
        y1y2 = np.random.randint(y_low, y_high, (n * 2, 2))
        x1x2 = np.random.randint(x_low, x_high, (n * 2, 2))
        # Filter out zero-height / zero-width candidates.
        y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= min_side][:n]
        x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= min_side][:n]
        if y1y2.shape[0] == n and x1x2.shape[0] == n:
            break
    # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2, then arrange the
    # columns in (y1, x1, y2, x2) order.
    x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
    y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
    return np.hstack([y1, x1, y2, x2])


def generate_random_rois(image_shape, count, gt_class_ids, gt_boxes):
    """Generate ROI proposals similar to what a region proposal network
    would generate.

    About 90% of the ROIs are sampled around the GT boxes (an equal share
    per box, within a window extending one box height/width beyond each
    side of the box, clipped to the image). The rest are sampled anywhere
    in the image.

    image_shape: [height, width, depth]
    count: number of ROIs to generate
    gt_class_ids: [n] integer ground truth class ids (not used here)
    gt_boxes: [n, (y1, x1, y2, x2)] ground truth boxes in pixels.

    Returns: [count, (y1, x1, y2, x2)] int32 ROI boxes in pixels.
    """
    # Placeholder
    rois = np.zeros((count, 4), dtype=np.int32)

    # Random ROIs around GT boxes (90% of count). With no GT boxes, all
    # ROIs come from the image-wide sampling below.
    num_gt = gt_boxes.shape[0]
    rois_per_box = int(0.9 * count / num_gt) if num_gt else 0
    for i in range(num_gt):
        gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
        h = gt_y2 - gt_y1
        w = gt_x2 - gt_x1
        # Sampling window around the GT box, clipped to the image.
        r_y1 = max(gt_y1 - h, 0)
        r_y2 = min(gt_y2 + h, image_shape[0])
        r_x1 = max(gt_x1 - w, 0)
        r_x2 = min(gt_x2 + w, image_shape[1])

        rois[rois_per_box * i:rois_per_box * (i + 1)] = _sample_random_boxes(
            r_y1, r_y2, r_x1, r_x2, rois_per_box)

    # Random ROIs anywhere in the image (the remaining ~10% of count).
    remaining_count = count - (rois_per_box * num_gt)
    # Explicit start offset: a negative slice would select the whole array
    # if remaining_count were ever 0.
    rois[count - remaining_count:] = _sample_random_boxes(
        0, image_shape[0], 0, image_shape[1], remaining_count)
    return rois
def data_generator(dataset, config, shuffle=True, augment=False, augmentation=None,
                   random_rois=0, batch_size=1, detection_targets=False,
                   no_augmentation_sources=None):
    """A generator that returns images and corresponding target class ids,
    bounding box deltas, and masks.

    dataset: the dataset object to pick data from
    config: the model config object
    shuffle: if True, shuffles the samples before every epoch
    augment: (deprecated, use augmentation instead). Passed through to
        load_image_gt().
    augmentation: optional. An imgaug (https://github.com/aleju/imgaug)
        augmentation. For example, passing imgaug.augmenters.fliplr(0.5)
        flips images right/left 50% of the time.
    random_rois: if > 0 then generate proposals to be used to train the
                 network classifier and mask heads. Useful if training
                 the mask rcnn part without the rpn.
    batch_size: how many images to return in each call
    detection_targets: if True, generate detection targets (class ids, bbox
        deltas, and masks). Typically for debugging or visualizations because
        in training detection targets are generated by detectiontargetlayer.
        Only takes effect when random_rois is non-zero.
    no_augmentation_sources: optional. List of sources to exclude for
        augmentation. A source is a string that identifies a dataset and is
        defined in the dataset class.

    Returns a python generator. Upon calling next() on it, the
    generator returns two lists, inputs and outputs. The contents
    of the lists differ depending on the received arguments:
    inputs list:
    - images: [batch, h, w, c]
    - image_meta: [batch, (meta data)] image details. See compose_image_meta()
    - rpn_match: [batch, n, 1] integer (1=positive anchor, -1=negative, 0=neutral)
    - rpn_bbox: [batch, n, (dy, dx, log(dh), log(dw))] anchor bbox deltas.
    - gt_class_ids: [batch, max_gt_instances] integer class ids
    - gt_boxes: [batch, max_gt_instances, (y1, x1, y2, x2)]
    - gt_masks: [batch, height, width, max_gt_instances]. The height and width
                are those of the masks returned by load_image_gt().
    - rpn_rois (only if random_rois) and rois (only if detection_targets too)

    outputs list: usually empty in regular training. But if detection_targets
        is True then the outputs list contains target class_ids, bbox deltas,
        and masks.

    Images that fail to load or process are logged and skipped; the sixth
    such failure is re-raised.

    NOTE(review): config attribute names are kept in the lower-case spelling
    used throughout this file; confirm they match the config class.
    """
    b = 0  # batch item index
    image_index = -1
    image_ids = np.copy(dataset.image_ids)
    error_count = 0
    no_augmentation_sources = no_augmentation_sources or []

    # Anchors: [anchor_count, (y1, x1, y2, x2)]
    backbone_shapes = compute_backbone_shapes(config, config.image_shape)
    anchors = utils.generate_pyramid_anchors(config.rpn_anchor_scales,
                                             config.rpn_anchor_ratios,
                                             backbone_shapes,
                                             config.backbone_strides,
                                             config.rpn_anchor_stride)

    # Keras requires a generator to run indefinitely.
    while True:
        try:
            # Increment index to pick next image. Shuffle if at the start of an epoch.
            image_index = (image_index + 1) % len(image_ids)
            if shuffle and image_index == 0:
                np.random.shuffle(image_ids)

            # Get GT bounding boxes and masks for image.
            image_id = image_ids[image_index]

            # If the image source is not to be augmented, pass None as augmentation.
            if dataset.image_info[image_id]['source'] in no_augmentation_sources:
                image_augmentation = None
            else:
                image_augmentation = augmentation
            image, image_meta, gt_class_ids, gt_boxes, gt_masks = \
                load_image_gt(dataset, config, image_id, augment=augment,
                              augmentation=image_augmentation,
                              use_mini_mask=config.use_mini_mask)

            # Skip images that have no instances. This can happen in cases
            # where we train on a subset of classes and the image doesn't
            # have any of the classes we care about.
            if not np.any(gt_class_ids > 0):
                continue

            # RPN targets
            rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors,
                                                    gt_class_ids, gt_boxes, config)

            # Mask R-CNN targets
            if random_rois:
                rpn_rois = generate_random_rois(
                    image.shape, random_rois, gt_class_ids, gt_boxes)
                if detection_targets:
                    rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask =\
                        build_detection_targets(
                            rpn_rois, gt_class_ids, gt_boxes, gt_masks, config)

            # Init batch arrays at the start of every batch. Shapes and
            # dtypes are taken from the first sample of the batch.
            if b == 0:
                batch_image_meta = np.zeros(
                    (batch_size,) + image_meta.shape, dtype=image_meta.dtype)
                batch_rpn_match = np.zeros(
                    [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
                batch_rpn_bbox = np.zeros(
                    [batch_size, config.rpn_train_anchors_per_image, 4], dtype=rpn_bbox.dtype)
                batch_images = np.zeros(
                    (batch_size,) + image.shape, dtype=np.float32)
                batch_gt_class_ids = np.zeros(
                    (batch_size, config.max_gt_instances), dtype=np.int32)
                batch_gt_boxes = np.zeros(
                    (batch_size, config.max_gt_instances, 4), dtype=np.int32)
                batch_gt_masks = np.zeros(
                    (batch_size, gt_masks.shape[0], gt_masks.shape[1],
                     config.max_gt_instances), dtype=gt_masks.dtype)
                if random_rois:
                    batch_rpn_rois = np.zeros(
                        (batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype)
                    if detection_targets:
                        batch_rois = np.zeros(
                            (batch_size,) + rois.shape, dtype=rois.dtype)
                        batch_mrcnn_class_ids = np.zeros(
                            (batch_size,) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype)
                        batch_mrcnn_bbox = np.zeros(
                            (batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype)
                        batch_mrcnn_mask = np.zeros(
                            (batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype)

            # If more instances than fit in the array, sub-sample from them.
            if gt_boxes.shape[0] > config.max_gt_instances:
                ids = np.random.choice(
                    np.arange(gt_boxes.shape[0]), config.max_gt_instances, replace=False)
                gt_class_ids = gt_class_ids[ids]
                gt_boxes = gt_boxes[ids]
                gt_masks = gt_masks[:, :, ids]

            # Add to batch
            batch_image_meta[b] = image_meta
            batch_rpn_match[b] = rpn_match[:, np.newaxis]
            batch_rpn_bbox[b] = rpn_bbox
            batch_images[b] = mold_image(image.astype(np.float32), config)
            batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids
            batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
            batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks
            if random_rois:
                batch_rpn_rois[b] = rpn_rois
                if detection_targets:
                    batch_rois[b] = rois
                    batch_mrcnn_class_ids[b] = mrcnn_class_ids
                    batch_mrcnn_bbox[b] = mrcnn_bbox
                    batch_mrcnn_mask[b] = mrcnn_mask
            b += 1

            # Batch full?
            if b >= batch_size:
                inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox,
                          batch_gt_class_ids, batch_gt_boxes, batch_gt_masks]
                outputs = []

                if random_rois:
                    inputs.extend([batch_rpn_rois])
                    if detection_targets:
                        inputs.extend([batch_rois])
                        # Keras requires that output and targets have the same number of dimensions
                        batch_mrcnn_class_ids = np.expand_dims(
                            batch_mrcnn_class_ids, -1)
                        outputs.extend(
                            [batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask])

                yield inputs, outputs

                # Start a new batch
                b = 0
        except Exception:
            # GeneratorExit, KeyboardInterrupt and SystemExit do not derive
            # from Exception, so they propagate untouched.
            # Log it and skip the image.
            logging.exception("error processing image {}".format(
                dataset.image_info[image_id]))
            error_count += 1
            if error_count > 5:
                raise
############################################################

#  maskrcnn class

############################################################
class maskrcnn():

    """encapsulates the mask rcnn model functionality.
    the actual keras model is in the keras_model property.

    """
    def __init__(self, mode, config, model_dir):
        """Store the settings, set up the log directory and build the model.

        mode: either "training" or "inference"
        config: a sub-class of the config class
        model_dir: directory to save training logs and trained weights
        """
        assert mode in ('training', 'inference')
        self.mode, self.config, self.model_dir = mode, config, model_dir
        # The log directory is set up first, then the Keras model is built
        # and exposed as `keras_model`.
        self.set_log_dir()
        self.keras_model = self.build(mode=mode, config=config)
    def build(self, mode, config):
        """Build the Mask R-CNN architecture and return the Keras model.

        mode: either "training" or "inference". The inputs and
            outputs of the model differ accordingly.
        config: the model config object. Image dimensions in
            config.image_shape must be divisible by 2**6.

        Returns the Keras model, wrapped for multi-GPU use when
        config.gpu_count > 1.

        Raises Exception if the image size is not divisible by 2**6.

        NOTE(review): project-level identifiers (config attributes,
        proposallayer, detectiontargetlayer, detectionlayer, parallelmodel)
        are kept in the lower-case spelling used throughout this file;
        confirm they match their definitions.
        """
        assert mode in ['training', 'inference']

        # Image size must be dividable by 2 multiple times
        h, w = config.image_shape[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception("image size must be dividable by 2 at least 6 times "
                            "to avoid fractions when downscaling and upscaling."
                            "for example, use 256, 320, 384, 448, 512, ... etc. ")

        # Inputs
        input_image = kl.Input(
            shape=[None, None, config.image_shape[2]], name="input_image")
        input_image_meta = kl.Input(shape=[config.image_meta_size],
                                    name="input_image_meta")
        if mode == "training":
            # RPN GT
            input_rpn_match = kl.Input(
                shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
            input_rpn_bbox = kl.Input(
                shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

            # Detection GT (class ids, bounding boxes, and masks)
            # 1. GT class ids (zero padded)
            input_gt_class_ids = kl.Input(
                shape=[None], name="input_gt_class_ids", dtype=tf.int32)
            # 2. GT boxes in pixels (zero padded)
            # [batch, max_gt_instances, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = kl.Input(
                shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
            # Normalize coordinates
            gt_boxes = kl.Lambda(lambda x: norm_boxes_graph(
                x, k.shape(input_image)[1:3]))(input_gt_boxes)
            # 3. GT masks (zero padded)
            # [batch, height, width, max_gt_instances]
            if config.use_mini_mask:
                input_gt_masks = kl.Input(
                    shape=[config.mini_mask_shape[0],
                           config.mini_mask_shape[1], None],
                    name="input_gt_masks", dtype=bool)
            else:
                input_gt_masks = kl.Input(
                    shape=[config.image_shape[0], config.image_shape[1], None],
                    name="input_gt_masks", dtype=bool)
        elif mode == "inference":
            # Anchors in normalized coordinates
            input_anchors = kl.Input(shape=[None, 4], name="input_anchors")

        # Build the shared convolutional layers.
        # Bottom-up layers: the backbone returns the last layer of each of
        # its five stages; the first one is not used by the pyramid.
        if callable(config.backbone):
            _, c2, c3, c4, c5 = config.backbone(input_image, stage5=True,
                                                train_bn=config.train_bn)
        else:
            _, c2, c3, c4, c5 = resnet_graph(input_image, config.backbone,
                                             stage5=True, train_bn=config.train_bn)
        # Top-down layers
        # TODO: add assert to verify feature map sizes match what's in config
        p5 = kl.Conv2D(config.top_down_pyramid_size, (1, 1), name='fpn_c5p5')(c5)
        p4 = kl.Add(name="fpn_p4add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(p5),
            kl.Conv2D(config.top_down_pyramid_size, (1, 1), name='fpn_c4p4')(c4)])
        p3 = kl.Add(name="fpn_p3add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(p4),
            kl.Conv2D(config.top_down_pyramid_size, (1, 1), name='fpn_c3p3')(c3)])
        p2 = kl.Add(name="fpn_p2add")([
            kl.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(p3),
            kl.Conv2D(config.top_down_pyramid_size, (1, 1), name='fpn_c2p2')(c2)])
        # Attach 3x3 conv to all P layers to get the final feature maps.
        p2 = kl.Conv2D(config.top_down_pyramid_size, (3, 3), padding="same", name="fpn_p2")(p2)
        p3 = kl.Conv2D(config.top_down_pyramid_size, (3, 3), padding="same", name="fpn_p3")(p3)
        p4 = kl.Conv2D(config.top_down_pyramid_size, (3, 3), padding="same", name="fpn_p4")(p4)
        p5 = kl.Conv2D(config.top_down_pyramid_size, (3, 3), padding="same", name="fpn_p5")(p5)
        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        p6 = kl.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(p5)

        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [p2, p3, p4, p5, p6]
        mrcnn_feature_maps = [p2, p3, p4, p5]

        # Anchors
        if mode == "training":
            anchors = self.get_anchors(config.image_shape)
            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors, (config.batch_size,) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = kl.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
        else:
            anchors = input_anchors

        # RPN model (shared across all pyramid levels)
        rpn = build_rpn_model(config.rpn_anchor_stride,
                              len(config.rpn_anchor_ratios), config.top_down_pyramid_size)
        # Apply it to every pyramid level: list of per-level output lists.
        layer_outputs = [rpn([p]) for p in rpn_feature_maps]
        # Concatenate layer outputs.
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
        outputs = list(zip(*layer_outputs))
        outputs = [kl.Concatenate(axis=1, name=n)(list(o))
                   for o, n in zip(outputs, output_names)]

        rpn_class_logits, rpn_class, rpn_bbox = outputs

        # Generate proposals
        # Proposals are [batch, n, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        proposal_count = config.post_nms_rois_training if mode == "training"\
            else config.post_nms_rois_inference
        rpn_rois = proposallayer(
            proposal_count=proposal_count,
            nms_threshold=config.rpn_nms_threshold,
            name="roi",
            config=config)([rpn_class, rpn_bbox, anchors])

        if mode == "training":
            # Class id mask to mark class ids supported by the dataset the image
            # came from.
            active_class_ids = kl.Lambda(
                lambda x: parse_image_meta_graph(x)["active_class_ids"]
                )(input_image_meta)

            if not config.use_rpn_rois:
                # Ignore predicted rois and use rois provided as an input.
                input_rois = kl.Input(shape=[config.post_nms_rois_training, 4],
                                      name="input_roi", dtype=np.int32)
                # Normalize coordinates
                target_rois = kl.Lambda(lambda x: norm_boxes_graph(
                    x, k.shape(input_image)[1:3]))(input_rois)
            else:
                target_rois = rpn_rois

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training
            # Note that proposal class ids, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            rois, target_class_ids, target_bbox, target_mask =\
                detectiontargetlayer(config, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

            # Network heads. The fourth value returned by the classifier
            # graph is not needed in training.
            # TODO: verify that this handles zero padded rois
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox, _ =\
                fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                     config.pool_size, config.num_classes,
                                     train_bn=config.train_bn,
                                     fc_layers_size=config.fpn_classif_fc_layers_size)

            mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps,
                                              input_image_meta,
                                              config.mask_pool_size,
                                              config.num_classes,
                                              train_bn=config.train_bn)

            # TODO: clean up (use tf.identity if necessary)
            output_rois = kl.Lambda(lambda x: x * 1, name="output_rois")(rois)

            # Losses
            rpn_class_loss = kl.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")(
                [input_rpn_match, rpn_class_logits])
            rpn_bbox_loss = kl.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
                [input_rpn_bbox, input_rpn_match, rpn_bbox])
            class_loss = kl.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
                [target_class_ids, mrcnn_class_logits, active_class_ids])
            bbox_loss = kl.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
                [target_bbox, target_class_ids, mrcnn_bbox])
            mask_loss = kl.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
                [target_mask, target_class_ids, mrcnn_mask])

            # Model
            inputs = [input_image, input_image_meta,
                      input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
            if not config.use_rpn_rois:
                inputs.append(input_rois)
            outputs = [rpn_class_logits, rpn_class, rpn_bbox,
                       mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
                       rpn_rois, output_rois,
                       rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
            model = km.Model(inputs, outputs, name='mask_rcnn')
        else:
            # Network heads
            # Proposal classifier and bbox regressor heads. The fourth
            # returned value (feature maps) is forwarded to the detection
            # layer below.
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox, feature_maps =\
                fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                     config.pool_size, config.num_classes,
                                     train_bn=config.train_bn,
                                     fc_layers_size=config.fpn_classif_fc_layers_size)

            # Detections
            # NOTE(review): the upstream comment gives the layout as
            # [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
            # normalized coordinates; since feature_maps is also passed in,
            # the layout may carry extra columns — confirm in detectionlayer.
            detections = detectionlayer(config, name="mrcnn_detection")(
                [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta, feature_maps])

            # Create masks for detections (first four columns are the boxes)
            detection_boxes = kl.Lambda(lambda x: x[..., :4])(detections)
            mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps,
                                              input_image_meta,
                                              config.mask_pool_size,
                                              config.num_classes,
                                              train_bn=config.train_bn)

            model = km.Model([input_image, input_image_meta, input_anchors],
                             [detections, mrcnn_class, mrcnn_bbox,
                                 mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],
                             name='mask_rcnn')

        # Add multi-GPU support.
        if config.gpu_count > 1:
            from mrcnn.parallel_model import parallelmodel
            model = parallelmodel(model, config.gpu_count)

        return model
    def find_last(self):

        """finds the last checkpoint file of the last trained model in the

        model directory.

        returns:

            the path of the last checkpoint file

        """

        # get directory names. each directory corresponds to a model

        dir_names = next(os.walk(self.model_dir))[1]

        key = self.config.name.lower()

        dir_names = filter(lambda f: f.startswith(key), dir_names)

        dir_names = sorted(dir_names)

        if not dir_names:

            import errno

            raise filenotfounderror(

                errno.enoent,

                "could not find model directory under {}".format(self.model_dir))

        # pick last directory

        dir_name = os.path.join(self.model_dir, dir_names[-1])

        # find the last checkpoint

        checkpoints = next(os.walk(dir_name))[2]

        checkpoints = filter(lambda f: f.startswith("mask_rcnn"), checkpoints)

        checkpoints = sorted(checkpoints)

        if not checkpoints:

            import errno

            raise filenotfounderror(

                errno.enoent, "could not find weight files in {}".format(dir_name))

        checkpoint = os.path.join(dir_name, checkpoints[-1])

        return checkpoint
    def load_weights(self, filepath, by_name=False, exclude=None):
        """Modified version of the corresponding Keras function with
        the addition of multi-GPU support and the ability to exclude
        some layers from loading.

        filepath: path of the HDF5 weights file.
        by_name: if True, weights are loaded into layers by layer name.
        exclude: list of layer names to exclude. When given, loading is
            always done by name.

        After loading, the log directory is updated from filepath.

        Raises ImportError if h5py is not installed.
        """
        import h5py
        # Conditional import to support versions of Keras before 2.2
        # TODO: remove in about 6 months (end of 2018)
        try:
            from keras.engine import saving
        except ImportError:
            # Keras before 2.2 used the 'topology' namespace.
            from keras.engine import topology as saving

        if exclude:
            by_name = True

        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.
        keras_model = self.keras_model
        layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")\
            else keras_model.layers

        # Exclude some layers. A list (not a one-shot filter iterator) so
        # the loader may iterate it more than once.
        if exclude:
            layers = [layer for layer in layers if layer.name not in exclude]

        # The context manager closes the file even if loading fails, and
        # closes the file itself rather than the 'model_weights' group.
        with h5py.File(filepath, mode='r') as weights_file:
            group = weights_file
            if 'layer_names' not in group.attrs and 'model_weights' in group:
                group = weights_file['model_weights']

            if by_name:
                saving.load_weights_from_hdf5_group_by_name(group, layers)
            else:
                saving.load_weights_from_hdf5_group(group, layers)

        # Update the log directory
        self.set_log_dir(filepath)
    def get_imagenet_weights(self):
        """Download ImageNet trained weights via Keras' ``get_file``.

        Returns the path to the weights file.
        """
        from keras.utils.data_utils import get_file

        file_name = 'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
        # Parenthesised implicit concatenation instead of backslash
        # continuations, which break as soon as a blank line follows them.
        tf_weights_path_no_top = (
            'https://github.com/fchollet/deep-learning-models/'
            'releases/download/v0.2/'
            'resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
        weights_path = get_file(file_name,
                                tf_weights_path_no_top,
                                cache_subdir='models',
                                md5_hash='a268eb855778b3df3c7506639542a6af')
        return weights_path

    def compile(self, learning_rate, momentum):
        """Get the model ready for training.

        Adds losses, regularization, and metrics, then calls the Keras
        compile() function.

        learning_rate: learning rate passed to the SGD optimizer.
        momentum: momentum passed to the SGD optimizer.
        """
        # Optimizer object
        optimizer = keras.optimizers.SGD(
            lr=learning_rate, momentum=momentum,
            clipnorm=self.config.gradient_clip_norm)

        # Add losses.
        # First, clear previously set losses to avoid duplication.
        self.keras_model._losses = []
        self.keras_model._per_input_losses = {}
        loss_names = [
            "rpn_class_loss",  "rpn_bbox_loss",
            "mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss"]
        for name in loss_names:
            layer = self.keras_model.get_layer(name)
            if layer.output in self.keras_model.losses:
                continue
            # Mean of the loss layer's output, scaled by its configured weight
            # (weight defaults to 1 when the name is not configured).
            loss = (
                tf.reduce_mean(layer.output, keepdims=True)
                * self.config.loss_weights.get(name, 1.))
            self.keras_model.add_loss(loss)

        # Add L2 regularization.
        # Skip gamma and beta weights of batch normalization layers.
        reg_losses = [
            keras.regularizers.l2(self.config.weight_decay)(w) / tf.cast(tf.size(w), tf.float32)
            for w in self.keras_model.trainable_weights
            if 'gamma' not in w.name and 'beta' not in w.name]
        self.keras_model.add_loss(tf.add_n(reg_losses))

        # Compile. Losses were attached via add_loss(), so no per-output loss.
        self.keras_model.compile(
            optimizer=optimizer,
            loss=[None] * len(self.keras_model.outputs))

        # Add metrics for losses
        for name in loss_names:
            if name in self.keras_model.metrics_names:
                continue
            layer = self.keras_model.get_layer(name)
            self.keras_model.metrics_names.append(name)
            loss = (
                tf.reduce_mean(layer.output, keepdims=True)
                * self.config.loss_weights.get(name, 1.))
            self.keras_model.metrics_tensors.append(loss)

    def set_trainable(self, layer_regex, keras_model=None, indent=0, verbose=1):
        """Set model layers as trainable if their names match the given
        regular expression.

        layer_regex: regular expression that must match the full layer name.
        keras_model: model whose layers are updated. Defaults to
            ``self.keras_model``; set on recursive calls for nested models.
        indent: number of spaces used to indent logged layer names.
        verbose: values above 0 log the name of each layer made trainable.
        """
        # Print message on the first call (but not on recursive calls)
        if verbose > 0 and keras_model is None:
            log("selecting layers to train")

        keras_model = keras_model or self.keras_model

        # In multi-GPU training, we wrap the model. Get layers
        # of the inner model because they have the weights.
        layers = (keras_model.inner_model.layers
                  if hasattr(keras_model, "inner_model")
                  else keras_model.layers)

        for layer in layers:
            # Is the layer a model? Then recurse into its own layers.
            if layer.__class__.__name__ == 'Model':
                print("in model: ", layer.name)
                self.set_trainable(
                    layer_regex, keras_model=layer, indent=indent + 4,
                    verbose=verbose)
                continue

            if not layer.weights:
                continue

            # Is it trainable?
            trainable = bool(re.fullmatch(layer_regex, layer.name))
            # Update layer. If layer is a container, update inner layer.
            if layer.__class__.__name__ == 'TimeDistributed':
                layer.layer.trainable = trainable
            else:
                layer.trainable = trainable
            # Print trainable layer names
            if trainable and verbose > 0:
                log("{}{:20}   ({})".format(" " * indent, layer.name,
                                            layer.__class__.__name__))

    def set_log_dir(self, model_path=none):

        """sets the model log directory and epoch counter.
        model_path: if none, or a format different from what this code uses

            then set a new log directory and start epochs from 0. otherwise,

            extract the log directory and the epoch counter from the file

            name.

        """

        # set date and epoch counter as if starting a new model

        self.epoch = 0

        now = datetime.datetime.now()
        # if we have a model path with date and epochs use them

        if model_path:

            # continue from we left of. get epoch and date from the file name

            # a sample model path might look like:

            # \path\to\logs\coco20171029t2315\mask_rcnn_coco_0001.h5 (windows)

            # /path/to/logs/coco20171029t2315/mask_rcnn_coco_0001.h5 (linux)

            regex = r".*[/\\][\w-] (\d{4})(\d{2})(\d{2})t(\d{2})(\d{2})[/\\]mask\_rcnn\_[\w-] (\d{4})\.h5"

            m = re.match(regex, model_path)

            if m:

                now = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),

                                        int(m.group(4)), int(m.group(5)))

                # epoch number in file is 1-based, and in keras code it's 0-based.

                # so, adjust for that then increment by one to start from the next epoch

                self.epoch = int(m.group(6)) - 1   1

                print('re-starting from epoch %d' % self.epoch)
        # directory for training logs

        self.log_dir = os.path.join(self.model_dir, "{}{:%y%m%dt%h%m}".format(

            self.config.name.lower(), now))
        # path to save after each epoch. include placeholders that get filled by keras.

        self.checkpoint_path = os.path.join(self.log_dir, "mask_rcnn_{}_*epoch*.h5".format(

            self.config.name.lower()))

        self.checkpoint_path = self.checkpoint_path.replace(

            "*epoch*", "{epoch:04d}")
    def train(self, train_dataset, val_dataset, learning_rate, epochs, layers,
              augmentation=None, custom_callbacks=None, no_augmentation_sources=None):
        """Train the model.

        train_dataset, val_dataset: training and validation dataset objects.
        learning_rate: the learning rate to train with.
        epochs: number of training epochs. Note that previous training epochs
                are considered to be done already, so this actually determines
                the epochs to train in total rather than in this particular
                call.
        layers: allows selecting which layers to train. It can be:
            - a regular expression to match layer names to train
            - one of these predefined values:
              heads: the RPN, classifier and mask heads of the network
              all: all the layers
              3+: train ResNet stage 3 and up
              4+: train ResNet stage 4 and up
              5+: train ResNet stage 5 and up
        augmentation: optional. An imgaug (https://github.com/aleju/imgaug)
            augmentation. For example, passing imgaug.augmenters.Fliplr(0.5)
            flips images right/left 50% of the time. You can pass complex
            augmentations as well. This augmentation applies 50% of the
            time, and when it does it flips images right/left half the time
            and adds a Gaussian blur with a random sigma in range 0 to 5.

                augmentation = imgaug.augmenters.Sometimes(0.5, [
                    imgaug.augmenters.Fliplr(0.5),
                    imgaug.augmenters.GaussianBlur(sigma=(0.0, 5.0))
                ])
        custom_callbacks: optional. Add custom callbacks to be called
            with the Keras fit_generator method. Must be a list of
            keras.callbacks instances.
        no_augmentation_sources: optional. List of sources to exclude for
            augmentation. A source is a string that identifies a dataset and
            is defined in the dataset class.
        """
        assert self.mode == "training", "create model in training mode."

        # Pre-defined layer regular expressions
        layer_regex = {
            # all layers but the backbone
            "heads": r"(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
            # from a specific ResNet stage and up
            "3+": r"(res3.*)|(bn3.*)|(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
            "4+": r"(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
            "5+": r"(res5.*)|(bn5.*)|(mrcnn\_.*)|(rpn\_.*)|(fpn\_.*)",
            # all layers
            "all": ".*",
        }
        if layers in layer_regex:
            layers = layer_regex[layers]

        # Data generators
        train_generator = data_generator(train_dataset, self.config, shuffle=True,
                                         augmentation=augmentation,
                                         batch_size=self.config.batch_size,
                                         no_augmentation_sources=no_augmentation_sources)
        val_generator = data_generator(val_dataset, self.config, shuffle=True,
                                       batch_size=self.config.batch_size)

        # Create log_dir if it does not exist
        if not os.path.exists(self.log_dir):
            os.makedirs(self.log_dir)

        # Callbacks
        callbacks = [
            keras.callbacks.TensorBoard(log_dir=self.log_dir,
                                        histogram_freq=0, write_graph=True, write_images=False),
            keras.callbacks.ModelCheckpoint(self.checkpoint_path,
                                            verbose=0, save_weights_only=True),
        ]

        # Add custom callbacks to the list
        if custom_callbacks:
            callbacks += custom_callbacks

        # Train
        log("\nstarting at epoch {}. lr={}\n".format(self.epoch, learning_rate))
        log("checkpoint path: {}".format(self.checkpoint_path))
        self.set_trainable(layers)
        self.compile(learning_rate, self.config.learning_momentum)

        # Work-around for Windows: Keras fails on Windows when using
        # multiprocessing workers. See discussion here:
        # https://github.com/matterport/mask_rcnn/issues/13#issuecomment-353124009
        # Compare by value: identity comparison against a string literal is
        # implementation-dependent.
        if os.name == 'nt':
            workers = 0
        else:
            workers = multiprocessing.cpu_count()

        self.keras_model.fit_generator(
            train_generator,
            initial_epoch=self.epoch,
            epochs=epochs,
            steps_per_epoch=self.config.steps_per_epoch,
            callbacks=callbacks,
            validation_data=val_generator,
            validation_steps=self.config.validation_steps,
            max_queue_size=100,
            workers=workers,
            use_multiprocessing=True,
        )
        self.epoch = max(self.epoch, epochs)

    def mold_inputs(self, images):
        """Convert a list of images into the arrays the network takes as input.

        images: list of image matrices [height, width, depth]. Images can have
            different sizes.

        Returns 3 numpy matrices:
        molded_images: [n, h, w, 3]. Images resized and normalized.
        image_metas: [n, length of meta data]. Details about each image.
        windows: [n, (y1, x1, y2, x2)]. The portion of the image that has the
            original image (padding excluded).
        """
        cfg = self.config
        resized_batch, meta_batch, window_batch = [], [], []
        for img in images:
            # Resize, then normalize.
            # TODO: move resizing to mold_image()
            resized, win, factor, _padding, _crop = utils.resize_image(
                img,
                min_dim=cfg.image_min_dim,
                min_scale=cfg.image_min_scale,
                max_dim=cfg.image_max_dim,
                mode=cfg.image_resize_mode)
            normalized = mold_image(resized, cfg)
            # Image id 0 and an all-zero active-class vector are used here.
            meta = compose_image_meta(
                0, img.shape, normalized.shape, win, factor,
                np.zeros([cfg.num_classes], dtype=np.int32))
            resized_batch.append(normalized)
            window_batch.append(win)
            meta_batch.append(meta)
        # Pack the per-image lists into arrays.
        return np.stack(resized_batch), np.stack(meta_batch), np.stack(window_batch)

    def unmold_detections(self, detections, mrcnn_mask, original_image_shape,
                          image_shape, window):
        """Reformat the detections of one image from the format of the neural
        network output to a format suitable for use in the rest of the
        application.

        detections: [n, (y1, x1, y2, x2, class_id, score, ...)] in normalized
            coordinates. Any columns from index 6 onward are returned as
            per-detection features.
        mrcnn_mask: [n, height, width, num_classes]
        original_image_shape: [h, w, c] original image shape before resizing
        image_shape: [h, w, c] shape of the image after resizing and padding
        window: [y1, x1, y2, x2] pixel coordinates of box in the image where
                the real image is excluding the padding.

        Returns:
        boxes: [n, (y1, x1, y2, x2)] bounding boxes in pixels
        class_ids: [n] integer class ids for each bounding box
        scores: [n] float probability scores of the class_id
        masks: [height, width, num_instances] instance masks
        features: [n, ...] detection columns from index 6 onward, row-aligned
            with boxes
        """
        # How many detections do we have?
        # Detections array is padded with zeros. Find the first class_id == 0.
        zero_ix = np.where(detections[:, 4] == 0)[0]
        n = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]

        # Extract boxes, class_ids, scores, features and class-specific masks
        boxes = detections[:n, :4]
        class_ids = detections[:n, 4].astype(np.int32)
        scores = detections[:n, 5]
        features = detections[:n, 6:]
        masks = mrcnn_mask[np.arange(n), :, :, class_ids]

        # Translate normalized coordinates in the resized image to pixel
        # coordinates in the original image before resizing
        window = utils.norm_boxes(window, image_shape[:2])
        wy1, wx1, wy2, wx2 = window
        shift = np.array([wy1, wx1, wy1, wx1])
        wh = wy2 - wy1  # window height
        ww = wx2 - wx1  # window width
        scale = np.array([wh, ww, wh, ww])
        # Convert boxes to normalized coordinates on the window
        boxes = np.divide(boxes - shift, scale)
        # Convert boxes to pixel coordinates on the original image
        boxes = utils.denorm_boxes(boxes, original_image_shape[:2])

        # Filter out detections with zero area. Happens in early training when
        # network weights are still random
        exclude_ix = np.where(
            (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
        if exclude_ix.shape[0] > 0:
            boxes = np.delete(boxes, exclude_ix, axis=0)
            class_ids = np.delete(class_ids, exclude_ix, axis=0)
            scores = np.delete(scores, exclude_ix, axis=0)
            masks = np.delete(masks, exclude_ix, axis=0)
            # Drop the same rows from features so they stay aligned with boxes.
            features = np.delete(features, exclude_ix, axis=0)
            n = class_ids.shape[0]

        # Resize masks to original image size and set boundary threshold.
        full_masks = [
            utils.unmold_mask(masks[i], boxes[i], original_image_shape)
            for i in range(n)]
        if full_masks:
            full_masks = np.stack(full_masks, axis=-1)
        else:
            # No detections: empty [h, w, 0] array. tuple() lets the shape be
            # given as a list as well as a tuple.
            full_masks = np.empty(tuple(original_image_shape[:2]) + (0,))

        return boxes, class_ids, scores, full_masks, features

    def detect(self, images, verbose=0):
        """Run the detection pipeline.

        images: list of images, potentially of different sizes.
        verbose: if truthy, log the inputs before running the model.

        Returns a list of dicts, one dict per image. The dict contains:
        rois: [n, (y1, x1, y2, x2)] detection bounding boxes
        class_ids: [n] int class ids
        scores: [n] float probability scores for the class ids
        masks: [h, w, n] instance binary masks
        features: the features value returned by unmold_detections() for
            the image
        """
        assert self.mode == "inference", "create model in inference mode."
        assert len(images) == self.config.batch_size, \
            "len(images) must be equal to batch_size"

        if verbose:
            log("processing {} images".format(len(images)))
            for image in images:
                log("image", image)

        # Mold inputs to format expected by the neural network
        molded_images, image_metas, windows = self.mold_inputs(images)

        # Validate image sizes.
        # All images in a batch must be of the same size.
        image_shape = molded_images[0].shape
        for g in molded_images[1:]:
            assert g.shape == image_shape, \
                "after resizing, all images must have the same size. check image_resize_mode and image sizes."

        # Anchors
        anchors = self.get_anchors(image_shape)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors, (self.config.batch_size,) + anchors.shape)

        if verbose:
            log("molded_images", molded_images)
            log("image_metas", image_metas)
            log("anchors", anchors)

        # Run object detection
        detections, _, _, mrcnn_mask, _, _, _ = \
            self.keras_model.predict([molded_images, image_metas, anchors], verbose=0)

        # Process detections
        results = []
        for i, image in enumerate(images):
            final_rois, final_class_ids, final_scores, final_masks, features = \
                self.unmold_detections(detections[i], mrcnn_mask[i],
                                       image.shape, molded_images[i].shape,
                                       windows[i])
            results.append({
                "rois": final_rois,
                "class_ids": final_class_ids,
                "scores": final_scores,
                "masks": final_masks,
                "features": features,
            })
        return results

    def detect_molded(self, molded_images, image_metas, verbose=0):
        """Run the detection pipeline, but expect inputs that are
        molded already. Used mostly for debugging and inspecting
        the model.

        molded_images: list of images loaded using load_image_gt()
        image_metas: image meta data, also returned by load_image_gt()
        verbose: if truthy, log the inputs before running the model.

        Returns a list of dicts, one dict per image. The dict contains:
        rois: [n, (y1, x1, y2, x2)] detection bounding boxes
        class_ids: [n] int class ids
        scores: [n] float probability scores for the class ids
        masks: [h, w, n] instance binary masks
        features: the features value returned by unmold_detections() for
            the image
        """
        assert self.mode == "inference", "create model in inference mode."
        assert len(molded_images) == self.config.batch_size, \
            "number of images must be equal to batch_size"

        if verbose:
            log("processing {} images".format(len(molded_images)))
            for image in molded_images:
                log("image", image)

        # Validate image sizes.
        # All images in a batch must be of the same size.
        image_shape = molded_images[0].shape
        for g in molded_images[1:]:
            assert g.shape == image_shape, "images must have the same size"

        # Anchors
        anchors = self.get_anchors(image_shape)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors, (self.config.batch_size,) + anchors.shape)

        if verbose:
            log("molded_images", molded_images)
            log("image_metas", image_metas)
            log("anchors", anchors)

        # Run object detection
        detections, _, _, mrcnn_mask, _, _, _ = \
            self.keras_model.predict([molded_images, image_metas, anchors], verbose=0)

        # Process detections
        results = []
        for i, image in enumerate(molded_images):
            # The image is already molded, so the window is the whole image.
            window = [0, 0, image.shape[0], image.shape[1]]
            # unmold_detections() returns five values; unpack all of them
            # (as detect() does) instead of failing on a four-name unpack.
            final_rois, final_class_ids, final_scores, final_masks, features = \
                self.unmold_detections(detections[i], mrcnn_mask[i],
                                       image.shape, molded_images[i].shape,
                                       window)
            results.append({
                "rois": final_rois,
                "class_ids": final_class_ids,
                "scores": final_scores,
                "masks": final_masks,
                "features": features,
            })
        return results

    def get_anchors(self, image_shape):
        """Return the anchor pyramid for the given image size.

        Results are memoized per image shape in ``self._anchor_cache``.
        """
        shapes = compute_backbone_shapes(self.config, image_shape)
        # Create the cache lazily on first use.
        if not hasattr(self, "_anchor_cache"):
            self._anchor_cache = {}
        cache_key = tuple(image_shape)
        if cache_key not in self._anchor_cache:
            # Generate anchors
            pixel_anchors = utils.generate_pyramid_anchors(
                self.config.rpn_anchor_scales,
                self.config.rpn_anchor_ratios,
                shapes,
                self.config.backbone_strides,
                self.config.rpn_anchor_stride)
            # Keep a copy of the latest anchors in pixel coordinates because
            # it's used in inspect_model notebooks.
            # TODO: remove this after the notebooks are refactored to not use it
            self.anchors = pixel_anchors
            # Store the normalized coordinates in the cache.
            self._anchor_cache[cache_key] = utils.norm_boxes(
                pixel_anchors, image_shape[:2])
        return self._anchor_cache[cache_key]

    def ancestor(self, tensor, name, checked=None):
        """Find the ancestor of a TF tensor in the computation graph.

        tensor: TensorFlow symbolic tensor.
        name: name of ancestor tensor to find (string or compiled pattern).
        checked: for internal use. A list of tensors that were already
                 searched to avoid loops in traversing the graph.

        Returns the first matching ancestor found depth-first, or None.
        """
        checked = checked if checked is not None else []
        # Put a limit on how deep we go to avoid very long loops
        if len(checked) > 500:
            return None
        # Convert name to a regex and allow matching a numeric suffix
        # ("_<digits>") before each "/", because Keras adds them automatically.
        if isinstance(name, str):
            name = re.compile(name.replace("/", r"(\_\d+)*/"))

        for parent in tensor.op.inputs:
            if parent in checked:
                continue
            if re.fullmatch(name, parent.name):
                return parent
            checked.append(parent)
            found = self.ancestor(parent, name, checked)
            if found is not None:
                return found
        return None

    def find_trainable_layer(self, layer):
        """If a layer is encapsulated by another layer, dig through the
        encapsulation and return the layer that holds the weights.

        Only wrappers whose class is named ``TimeDistributed`` are unwrapped;
        any other layer is returned unchanged.
        """
        # Class names are case-sensitive: the wrapper class is
        # "TimeDistributed", so a lower-cased literal would never match.
        while layer.__class__.__name__ == 'TimeDistributed':
            layer = layer.layer
        return layer

    def get_trainable_layers(self):
        """Return the list of layers that have weights.

        Each layer of ``self.keras_model`` is first passed through
        find_trainable_layer() so that wrappers are replaced by their inner
        layer; layers whose get_weights() is empty are left out.
        """
        unwrapped = (self.find_trainable_layer(candidate)
                     for candidate in self.keras_model.layers)
        return [inner for inner in unwrapped if inner.get_weights()]

    def run_graph(self, images, outputs, image_metas=None):
        """Run a sub-set of the computation graph that computes the given
        outputs.

        images: list of images; molded via mold_inputs() unless image_metas
            is provided.
        image_metas: if provided, the images are assumed to be already
            molded (i.e. resized, padded, and normalized)
        outputs: list of tuples (name, tensor) to compute. The tensors are
            symbolic TensorFlow tensors and the names are for easy tracking.

        Returns an ordered dict of results. Keys are the names received in the
        input and values are numpy arrays.
        """
        from collections import OrderedDict

        model = self.keras_model

        # Organize desired outputs into an ordered dict
        outputs = OrderedDict(outputs)
        for o in outputs.values():
            assert o is not None

        # Build a Keras function to run parts of the computation graph.
        # Work on a copy so the model's own input list is not mutated.
        feed_learning_phase = (model.uses_learning_phase
                               and not isinstance(k.learning_phase(), int))
        inputs = list(model.inputs)
        if feed_learning_phase:
            inputs += [k.learning_phase()]
        kf = k.function(inputs, list(outputs.values()))

        # Prepare inputs
        if image_metas is None:
            molded_images, image_metas, _ = self.mold_inputs(images)
        else:
            molded_images = images
        image_shape = molded_images[0].shape
        # Anchors
        anchors = self.get_anchors(image_shape)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors, (self.config.batch_size,) + anchors.shape)
        model_in = [molded_images, image_metas, anchors]

        # Run inference (learning phase 0 when the model takes it as input)
        if feed_learning_phase:
            model_in.append(0.)
        outputs_np = kf(model_in)

        # Pack the generated numpy arrays into a dict and log the results.
        # The loop variables must not be called `k`: that would make `k` a
        # local name and break the backend alias used above.
        outputs_np = OrderedDict(zip(outputs.keys(), outputs_np))
        for out_name, out_value in outputs_np.items():
            log(out_name, out_value)
        return outputs_np

############################################################

#  data formatting

############################################################
def compose_image_meta(image_id, original_image_shape, image_shape,
                       window, scale, active_class_ids):
    """Take attributes of an image and put them in one 1D array.

    image_id: an int id of the image. Useful for debugging.
    original_image_shape: [h, w, c] before resizing or padding.
    image_shape: [h, w, c] after resizing and padding
    window: (y1, x1, y2, x2) in pixels. The area of the image where the real
            image is (excluding the padding)
    scale: the scaling factor applied to the original image (float32)
    active_class_ids: list of class_ids available in the dataset from which
        the image came. Useful if training on images from multiple datasets
        where not all classes are present in all datasets.

    Returns a 1D numpy array laid out as the concatenation of the arguments
    in the order above; parse_image_meta() reads the same layout.
    """
    meta = np.array(
        [image_id] +                  # size=1
        list(original_image_shape) +  # size=3
        list(image_shape) +           # size=3
        list(window) +                # size=4 (y1, x1, y2, x2) in image coordinates
        [scale] +                     # size=1
        list(active_class_ids)        # size=num_classes
    )
    return meta


def parse_image_meta(meta):
    """Split an array of image attributes back into its components.

    See compose_image_meta() for more details.

    meta: [batch, meta length] where meta length depends on num_classes

    Returns a dict of the parsed values; every entry is cast to int32 except
    "scale", which is cast to float32.
    """
    return {
        "image_id": meta[:, 0].astype(np.int32),
        "original_image_shape": meta[:, 1:4].astype(np.int32),
        "image_shape": meta[:, 4:7].astype(np.int32),
        # (y1, x1, y2, x2) window of image in pixels
        "window": meta[:, 7:11].astype(np.int32),
        "scale": meta[:, 11].astype(np.float32),
        "active_class_ids": meta[:, 12:].astype(np.int32),
    }


def parse_image_meta_graph(meta):
    """Split a tensor of image attributes into its components.

    See compose_image_meta() for more details.

    meta: [batch, meta length] where meta length depends on num_classes

    Returns a dict of the parsed tensors (plain slices of ``meta``).
    """
    # Column index or column range of each component within a meta row.
    columns = {
        "image_id": 0,
        "original_image_shape": slice(1, 4),
        "image_shape": slice(4, 7),
        "window": slice(7, 11),  # (y1, x1, y2, x2) window of image in pixels
        "scale": 11,
        "active_class_ids": slice(12, None),
    }
    return {field: meta[:, cols] for field, cols in columns.items()}


def mold_image(images, config):
    """Convert an image (or array of images) to float32 and subtract
    ``config.mean_pixel`` from it.

    Expects image colors in RGB order.
    """
    as_float = images.astype(np.float32)
    return as_float - config.mean_pixel


def unmold_image(normalized_images, config):
    """Take an image normalized with mold_image() and return the original.

    Adds ``config.mean_pixel`` back and casts the result to uint8.
    """
    return (normalized_images + config.mean_pixel).astype(np.uint8)


############################################################

#  miscellaneous graph functions

############################################################
def trim_zeros_graph(boxes, name='trim_zeros'):
    """Remove all-zero rows from a zero-padded [n, 4] matrix of boxes.

    boxes: [n, 4] matrix of boxes.
    name: name given to the boolean-mask op.

    Returns:
    boxes: the rows of the input whose absolute values do not sum to zero
    non_zeros: [n] a 1D boolean mask identifying the rows to keep
    """
    row_magnitude = tf.reduce_sum(tf.abs(boxes), axis=1)
    non_zeros = tf.cast(row_magnitude, tf.bool)
    kept = tf.boolean_mask(boxes, non_zeros, name=name)
    return kept, non_zeros


def batch_pack_graph(x, counts, num_rows):
    """Pick a different number of leading values from each row of x and
    concatenate them.

    x: tensor indexed by row on its first axis.
    counts: per-row number of leading values to keep.
    num_rows: number of rows of x to process.
    """
    picked = [x[row, :counts[row]] for row in range(num_rows)]
    return tf.concat(picked, axis=0)
def norm_boxes_graph(boxes, shape):
    """Convert boxes from pixel coordinates to normalized coordinates.

    boxes: [..., (y1, x1, y2, x2)] in pixel coordinates
    shape: [..., (height, width)] in pixels

    Note: in pixel coordinates (y2, x2) is outside the box, but in
    normalized coordinates it is inside the box.

    Returns:
        [..., (y1, x1, y2, x2)] in normalized coordinates
    """
    shape_f = tf.cast(shape, tf.float32)
    h, w = tf.split(shape_f, 2)
    hw_hw = tf.concat([h, w, h, w], axis=-1)
    scale = hw_hw - tf.constant(1.0)
    # Pull (y2, x2) back by one pixel so it lands inside the box.
    shift = tf.constant([0., 0., 1., 1.])
    return tf.divide(boxes - shift, scale)
def denorm_boxes_graph(boxes, shape):
    """Convert boxes from normalized coordinates to pixel coordinates.

    boxes: [..., (y1, x1, y2, x2)] in normalized coordinates
    shape: [..., (height, width)] in pixels

    Note: in pixel coordinates (y2, x2) is outside the box, but in
    normalized coordinates it is inside the box.

    Returns:
        [..., (y1, x1, y2, x2)] in pixel coordinates, as int32
    """
    h, w = tf.split(tf.cast(shape, tf.float32), 2)
    scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
    shift = tf.constant([0., 0., 1., 1.])
    # The "+" operator had been lost from this expression (syntax error).
    # Adding shift back mirrors the subtraction in norm_boxes_graph().
    return tf.cast(tf.round(tf.multiply(boxes, scale) + shift), tf.int32)

model.py

复制上述代码，替换 /path/to/mask_rcnn-master/mask_rcnn-master/mrcnn/model.py 文件中的全部内容，请注意保留原 model.py 文件！

　　在新的 model.py 文件中，第2485行找到 detect() 函数，在函数注释后添加如下代码，提示函数调用：

print('function detect() is running...')

　　在第2531行 for 循环结束后，添加如下代码，输出每个 region 的 feature map：

        print(results)

        print(len(results[0]['features']))

在 anaconda prompt 中，重新加载 jupyter notebook，运行 demo.ipynb 文件，运行结果如下：

[tensorflow] 使用 mask_rcnn 完成目标检测与实例分割，同时输出每个区域的 feature map

[tensorflow] 使用 mask_rcnn 完成目标检测与实例分割，同时输出每个区域的 feature map的相关教程结束。

相关推荐

通过xtrabackup实现mysql实例的全库备份与按需单库恢复

centos7安装openstack(rocky版)-08.启动一个虚拟机实例

多加速器驱动agx的目标检测与车道分割

cvpr目标检测与实例分割算法解析：fcos（2019），mask r-cnn（2019），polarmask（2020）

[opencv实战]13 opencv中使用mask r-cnn进行对象检测和实例分割