一.定义一个RCNN的类作为基础
1. self.build_network(self,sess,is_training=True)
函数,构建网络框架。
# build_network: assemble the full Faster R-CNN graph inside one variable scope.
# Pipeline: VGG backbone -> RPN -> proposal generation -> per-ROI prediction heads.
with tf.variable_scope('RCNN'):
# Weight initializers: classification head uses stddev 0.01,
# bbox-regression head a tighter stddev 0.001.
initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)
# 1) Backbone: VGG16-style feature extractor over self._image.
net=self.VGG(is_training)
# 2) RPN: per-anchor objectness probabilities/scores and box-regression deltas.
rpn_cls_prob,rpn_bbox_pred,rpn_cls_score,rpn_cls_score_reshape=self.RPN(net,is_training)
# 3) Convert RPN outputs into region proposals (ROIs).
rois=self.build_proposals(is_training,rpn_cls_prob,rpn_bbox_pred,rpn_cls_score)
# 4) Per-ROI classification and bbox refinement heads.
cls_score,cls_prob,bbox_pred=self.build_predictions(net,rois,is_training,initializer,initializer_bbox)
# NOTE(review): cls_score is computed but not returned — presumably cached on
# self inside build_predictions for the loss; confirm against the full class.
return rois, cls_prob, bbox_pred
原始特征提取步骤使用经典的VGG16网络
然后将得到的输出net
传到RPN层里
再将得到的对于proposal的预测是前景还是背景(cls),以及具体位置预测传入生成ROIS的函数里
最后将ROIS放入预测函数。
2. self.VGG(self,is_training)
提取图像特征层
# VGG: VGG16-style feature extractor (conv1..conv5, first 13 conv layers).
# All convs are 3x3 / stride 1 / SAME (spatial size preserved); all pools are
# 2x2 / stride 2 / VALID (each halves H and W), so the output stride is 16.
# Intermediate conv outputs are cached in self._vgg for later inspection.
with tf.variable_scope("VGG"):
with slim.arg_scope([slim.conv2d],stride=1,padding="SAME",activation_fn=tf.nn.relu):
with slim.arg_scope([slim.max_pool2d],stride=2,padding="VALID"):
# Block 1: 64 channels.
conv1_1=slim.conv2d(self._image,64,[3,3],scope='conv1_1_3x3')
conv1_2=slim.conv2d(conv1_1,64,[3,3],scope='conv1_2_3x3')
self._vgg["conv1_2"]=conv1_2
pool1=slim.max_pool2d(conv1_2,[2,2],scope='pool1_2x2')
# Block 2: 128 channels.
conv2_1=slim.conv2d(pool1,128,[3,3],scope='conv2_1_3x3')
conv2_2=slim.conv2d(conv2_1,128,[3,3],scope='conv2_2_3x3')
self._vgg["conv2_2"]=conv2_2
pool2=slim.max_pool2d(conv2_2,[2,2],scope='pool2_2x2')
# Block 3: 256 channels.
conv3_1=slim.conv2d(pool2,256,[3,3],scope='conv3_1_3x3')
conv3_2=slim.conv2d(conv3_1,256,[3,3],scope='conv3_2_3x3')
conv3_3=slim.conv2d(conv3_2,256,[3,3],scope='conv3_3_3x3')
self._vgg["conv3_3"]=conv3_3
pool3=slim.max_pool2d(conv3_3,[2,2],scope='pool3_2x2')
# Block 4: 512 channels.
conv4_1=slim.conv2d(pool3,512,[3,3],scope='conv4_1_3x3')
conv4_2=slim.conv2d(conv4_1,512,[3,3],scope='conv4_2_3x3')
conv4_3=slim.conv2d(conv4_2,512,[3,3],scope='conv4_3_3x3')
self._vgg["conv4_3"]=conv4_3
pool4=slim.max_pool2d(conv4_3,[2,2],scope='pool4_2x2')
# Block 5: 512 channels; no pool afterwards — conv5_3 is the feature map.
conv5_1=slim.conv2d(pool4,512,[3,3],scope='conv5_1_3x3')
conv5_2=slim.conv2d(conv5_1,512,[3,3],scope='conv5_2_3x3')
conv5_3=slim.conv2d(conv5_2,512,[3,3],scope='conv5_3_3x3')
self._vgg["conv5_3"]=conv5_3
return conv5_3
# Output: a feature map with 512 channels; spatial size is input/16 and
# therefore variable (depends on the input image size).
VGG网络中每一个卷积层都使用3*3的卷积核,padding='SAME',步长为1;池化层都是2*2,步长为2,padding='VALID'。即卷积层不改变长宽,每个池化层使之缩小二分之一,最终特征图缩小为原图的1/16。由于都是重复的卷积层,可以直接使用slim.repeat()函数简化代码。
3. self.RPN(tensor,is_training)
RPN层,代替了Fast R-CNN网络中的selective search,速度提升了很多。
with tf.variable_scope("RPN"):
rpn_conv1=slim.conv2d(tensor,512,[3,3],padding="SAME",stride=1,
trainable=is_training,
weights_initializer=tf.truncated_normal_initializer(0.01),
activation_fn=tf.nn.relu,name="rpn_conv1")
rpn_cls_score=slim.conv2d(rpn_conv1,self._num_anchors*2,[1,1],padding="VALID",stride=1,
trainable=is_training,
weights_initializer=tf.truncated_normal_initializer(0.01),
activation_fn=tf.nn.sigmoid,name="rpn_cls")#Each anchor binary classification
rpn_cls_score_reshape=self._reshape_layer(rpn_cls_score,2,'rpn_cls_score_reshape')
rpn_cls_prob_reshape=self._softmax_layer(rpn_cls_score_reshape,"rpn_cls_prob_reshape")
rpn_cls_prob=self._reshape_layer(rpn_cls_prob_reshape,self._num_anchors*2,"rpn_cls_prob")
rpn_bbox_pred=slim.conv2d(rpn,self._num_anchors*4,[1,1],trainable=is_training,
weights_initializer=initializer,padding='VALID',activation_fn=None,scope='rpn_bbox_pred')
self._rpn["rpn_conv1"]=rpn_conv1
self._rpn["rpn_cls_prob"]=rpn_cls_prob
self._rpn["rpn_bbox_pred"]=rpn_bbox_pred
self._rpn["rpn_cls_score"]=rpn_cls_score
self._rpn["rpn_cls_score_reshape"]=rpn_cls_score_reshape
return rpn_cls_prob,rpn_bbox_pred,rpn_cls_score,rpn_cls_score_reshape
RPN层首先对VGG得到的feature map进行3x3的卷积(可能是为了语义空间转换,在知乎上看到的说法),然后利用两个1x1的卷积分别进行二分类(背景还是目标)和位置回归。进行分类的卷积核通道数为9×2(9个anchor,每个anchor二分类,sigmoid作为激活函数),进行位置回归的卷积核通道数为9×4(9个anchor,每个anchor有4个位置参数,没有激活函数)。RPN是一个全卷积网络(fully convolutional network),这样对输入图片的尺寸就没有要求了。
在这段代码里还用到了self._reshape_layer(self,bottom,num_dim,name)
,这个函数主要是将tensor的通道数修改为num_dim,因为是进行二分类,所以需要将9个anchor都进行softmax的操作,即使用函数self._softmax_layer(self,bottom,name)
。下面是这两个函数。
def _softmax_layer(self, bottom, name):
    """Apply softmax to `bottom`.

    For the special name 'rpn_cls_prob_reshape' the tensor is first
    flattened to 2-D so the softmax runs over the trailing (fg/bg)
    dimension, then restored to its original shape; otherwise softmax
    is applied directly.
    """
    if name != 'rpn_cls_prob_reshape':
        return tf.nn.softmax(bottom, name=name)
    original_shape = tf.shape(bottom)
    flattened = tf.reshape(bottom, [-1, original_shape[-1]])
    probabilities = tf.nn.softmax(flattened, name=name)
    return tf.reshape(probabilities, original_shape)
def _reshape_layer(self,bottom,num_