Args: in_channels (int): Number of channels in the input tensor,输入张量中的通道数
out_channels (int): Number of channels produced by the convolution,卷积提供的通道数
kernel_size (int or tuple): Size of the convolving kernel,卷积核大小
stride (int or tuple, optional): Stride of the convolution. Default: 1,卷积的步长。 默认值:1
padding (int or tuple, optional): Zero-padding added to the sides of the input during their respective convolutions. Default: 0,在它们各自的卷积期间将零填充添加到输入的边。 默认值:0
bias (bool, optional): If True, adds a learnable bias to the output. Default: False (the `bias=False` default of `SpatioTemporalConv.__init__`),如果为True,则向输出添加可学习的偏置。 默认值:False
class SpatioTemporalConv(nn.Module):
    r"""Factorised (2+1)D convolution: a spatial conv followed by a temporal conv.

    The input is first convolved with a spatial kernel into an intermediate
    channel space and then convolved along the temporal axis to produce the
    final output. Both convolutions are ``nn.Conv3d`` layers, and each one is
    followed by ``nn.BatchNorm3d`` and ``nn.ReLU`` inside :meth:`forward`.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels produced by the temporal
            convolution (i.e. by this module).
        kernel_size (int or tuple): Size of the convolving kernel. An int is
            expanded to a 3-tuple ``(t, h, w)``.
        stride (int or tuple, optional): Stride of the convolution; the
            spatial components go to the spatial conv and the temporal
            component to the temporal conv. Default: 1.
        padding (int or tuple, optional): Zero-padding added to the sides of
            the input, split between the two convolutions in the same way
            (see ``first_conv`` for the exception). Default: 0.
        bias (bool, optional): If ``True``, both convolutions get a learnable
            bias. Default: ``False``.
        first_conv (bool, optional): If ``True``, use the stem configuration:
            ``kernel_size`` and ``padding`` are handed to the spatial conv
            unchanged (so they are expected to already be spatial-only, e.g.
            ``(1, 7, 7)`` / ``(0, 3, 3)``), the temporal conv uses a fixed
            ``(3, 1, 1)`` kernel with ``(1, 0, 0)`` padding, and the number of
            intermediate channels is fixed to 45. Default: ``False``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, bias=False, first_conv=False):
        super(SpatioTemporalConv, self).__init__()

        # Scalars are broadcast to all three axes, e.g. 1 -> (1, 1, 1).
        kernel_size = _triple(kernel_size)
        stride = _triple(stride)
        padding = _triple(padding)

        # The stride is split the same way in both configurations: the
        # spatial conv never strides over time and vice versa.
        spatial_stride = (1, stride[1], stride[2])
        temporal_stride = (stride[0], 1, 1)

        if first_conv:
            # Stem layer: kernel and padding are used as given for the
            # spatial conv, while the temporal conv is hard-wired.
            spatial_kernel_size = kernel_size
            spatial_padding = padding
            temporal_kernel_size = (3, 1, 1)
            temporal_padding = (1, 0, 0)
            # Fixed width; the original comment attributes the value 45 to
            # the official implementation.
            intermed_channels = 45
        else:
            # Mask out the axis each conv does not operate on so that, for
            # example, padding is not applied twice.
            spatial_kernel_size = (1, kernel_size[1], kernel_size[2])
            spatial_padding = (0, padding[1], padding[2])
            temporal_kernel_size = (kernel_size[0], 1, 1)
            temporal_padding = (padding[0], 0, 0)

            # Intermediate width (the original comment cites section 3.5 of
            # the paper):
            #   floor(t*h*w*C_in*C_out / (h*w*C_in + t*C_out))
            # Integer floor division keeps the computation exact.
            spatial_volume = kernel_size[1] * kernel_size[2] * in_channels
            temporal_volume = kernel_size[0] * out_channels
            intermed_channels = (
                (kernel_size[0] * spatial_volume * out_channels)
                // (spatial_volume + temporal_volume)
            )

        # Spatial convolution into the intermediate channel space.
        self.spatial_conv = nn.Conv3d(in_channels, intermed_channels,
                                      spatial_kernel_size,
                                      stride=spatial_stride,
                                      padding=spatial_padding, bias=bias)
        self.bn1 = nn.BatchNorm3d(intermed_channels)

        # Temporal convolution from the intermediate space to the output.
        self.temporal_conv = nn.Conv3d(intermed_channels, out_channels,
                                       temporal_kernel_size,
                                       stride=temporal_stride,
                                       padding=temporal_padding, bias=bias)
        self.bn2 = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply spatial conv -> BN -> ReLU -> temporal conv -> BN -> ReLU."""
        x = self.relu(self.bn1(self.spatial_conv(x)))
        x = self.relu(self.bn2(self.temporal_conv(x)))
        return x
class SpatioTemporalResBlock(nn.Module):
    r"""Single residual block built from two ``SpatioTemporalConv`` units.

    The main branch is ``conv1 -> bn1 -> ReLU -> conv2 -> bn2``; its result is
    added to the shortcut branch and passed through a final ReLU. No pooling
    is used: when ``downsample`` is set, ``conv1`` runs with stride 2 and the
    shortcut is passed through a ``1x1x1`` ``SpatioTemporalConv`` with stride
    2 followed by batch norm so that both branches shrink together.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels in the output produced by the
            block.
        kernel_size (int or tuple): Size of the convolving kernels.
        downsample (bool, optional): If ``True``, the output is smaller than
            the input (stride-2 first conv and stride-2 shortcut conv).
            Default: ``False``.

    Note:
        Without ``downsample`` the input is added to the main branch as-is,
        so ``in_channels`` has to match ``out_channels`` for the sum in
        :meth:`forward` to be valid.
    """

    def __init__(self, in_channels, out_channels, kernel_size, downsample=False):
        super(SpatioTemporalResBlock, self).__init__()
        self.downsample = downsample

        # "Same" padding, computed per axis so that kernel_size may be an int
        # or a 3-tuple (a bare ``kernel_size // 2`` only works for ints).
        padding = tuple(k // 2 for k in _triple(kernel_size))

        if self.downsample:
            # Shortcut branch: 1x1x1 conv with stride 2 shrinks the input to
            # the size of the main branch, followed by batch norm.
            self.downsampleconv = SpatioTemporalConv(in_channels, out_channels, 1, stride=2)
            self.downsamplebn = nn.BatchNorm3d(out_channels)
            # Main branch: the first conv performs the stride-2 reduction.
            self.conv1 = SpatioTemporalConv(in_channels, out_channels, kernel_size,
                                            padding=padding, stride=2)
        else:
            # No downsampling: the shortcut is the identity.
            self.conv1 = SpatioTemporalConv(in_channels, out_channels, kernel_size,
                                            padding=padding)

        self.bn1 = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU()

        # Second conv of the main branch keeps the size and channel count.
        self.conv2 = SpatioTemporalConv(out_channels, out_channels, kernel_size,
                                        padding=padding)
        self.bn2 = nn.BatchNorm3d(out_channels)

    def forward(self, x):
        """Return ``ReLU(shortcut(x) + main(x))``."""
        res = self.relu(self.bn1(self.conv1(x)))
        res = self.bn2(self.conv2(res))

        if self.downsample:
            x = self.downsamplebn(self.downsampleconv(x))

        return self.relu(x + res)
class SpatioTemporalResLayer(nn.Module):
    r"""One layer of the ResNet: ``layer_size`` residual blocks stacked in sequence.

    The first block maps ``in_channels`` to ``out_channels`` and is the only
    one that receives the ``downsample`` flag; the remaining
    ``layer_size - 1`` blocks map ``out_channels`` to ``out_channels`` and are
    built with the block type's default ``downsample`` setting.

    Args:
        in_channels (int): Number of channels in the input tensor.
        out_channels (int): Number of channels in the output produced by the
            layer.
        kernel_size (int or tuple): Size of the convolving kernels.
        layer_size (int): Number of blocks to be stacked to form the layer.
        block_type (Module, optional): Type of block used to form the layer.
            It is called as ``block_type(in, out, kernel_size[, downsample])``.
            Default: ``SpatioTemporalResBlock``.
        downsample (bool, optional): If ``True``, the first block in the layer
            implements downsampling. Default: ``False``.
    """

    def __init__(self, in_channels, out_channels, kernel_size, layer_size,
                 block_type=SpatioTemporalResBlock, downsample=False):
        super(SpatioTemporalResLayer, self).__init__()

        # First block: changes the channel count and optionally downsamples.
        self.block1 = block_type(in_channels, out_channels, kernel_size, downsample)

        # Remaining layer_size - 1 blocks are identical to each other and
        # never receive the downsample flag.
        self.blocks = nn.ModuleList([
            block_type(out_channels, out_channels, kernel_size)
            for _ in range(layer_size - 1)
        ])

    def forward(self, x):
        """Run the input through the first block and then every stacked block."""
        x = self.block1(x)
        for block in self.blocks:
            x = block(x)
        return x
R2Plus1DNet
class R2Plus1DNet(nn.Module):
    r"""ResNet-style feature extractor made of five (2+1)D convolutional layers.

    The network is a stem ``SpatioTemporalConv`` followed by four
    ``SpatioTemporalResLayer`` stages whose block counts are given by
    ``layer_sizes``, and ends with a global average pool that yields a
    512-dimensional vector for each element in the batch.

    Args:
        layer_sizes (tuple): An indexable containing the number of blocks in
            each of the four residual layers (indices 0 to 3 are read).
        block_type (Module, optional): Type of block used to form the layers.
            Default: ``SpatioTemporalResBlock``.
    """

    def __init__(self, layer_sizes, block_type=SpatioTemporalResBlock):
        super(R2Plus1DNet, self).__init__()

        # Stem: 3 input channels, spatial kernel (1, 7, 7) with stride 2 on
        # height/width only, and matching (0, 3, 3) padding.
        self.conv1 = SpatioTemporalConv(3, 64, (1, 7, 7), stride=(1, 2, 2),
                                        padding=(0, 3, 3), first_conv=True)
        # Second layer keeps the channel count and does not downsample.
        self.conv2 = SpatioTemporalResLayer(64, 64, 3, layer_sizes[0],
                                            block_type=block_type)
        # Each of the last three layers doubles the channel count and
        # downsamples in its first block.
        self.conv3 = SpatioTemporalResLayer(64, 128, 3, layer_sizes[1],
                                            block_type=block_type, downsample=True)
        self.conv4 = SpatioTemporalResLayer(128, 256, 3, layer_sizes[2],
                                            block_type=block_type, downsample=True)
        self.conv5 = SpatioTemporalResLayer(256, 512, 3, layer_sizes[3],
                                            block_type=block_type, downsample=True)

        # Global average pooling of the output down to 1x1x1 per channel.
        self.pool = nn.AdaptiveAvgPool3d(1)

    def forward(self, x):
        """Run all five layers, pool globally and flatten to ``(-1, 512)``."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)

        x = self.pool(x)
        return x.view(-1, 512)