普通3d稀疏卷积RuleBook构建
我们继续看普通稀疏卷积 RuleBook 的建立过程。返回 src/spconv/spconv_ops.cc,看 getIndicePairs 函数中普通 3D 稀疏卷积的部分。
// --- Regular (non-submanifold) sparse-conv rulebook construction ---
// indicePairs.numel() / 2 equals kernelVolume * numActIn: one slot for every
// possible (kernel offset, active input) pair. Slots are initialized to
// INT_MAX so unused entries collapse into a single trailing value after
// torch::_unique below; the "+ 1" reserves one extra sentinel slot.
auto indicePairUnique = torch::full({indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),torch::dtype(torch::kInt32).device(indices.device()));
// Worst case: each of the numAct active inputs produces kernelVolume outputs.
torch::Tensor outInds = torch::zeros({numAct * kernelVolume, coorDim + 1},torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
// CPU path builds the whole rulebook in a single call.
numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize, stride,padding, dilation, outSpatialShape, transpose, false, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
// GPU phase 1: for each active input, record (input idx, flat output idx)
// pairs and collect the flat output indices into indicePairUnique.
numActOut = create_conv_indice_pair_p1_cuda(indices,
indicePairs,
indiceNum,
indicePairUnique,
kernelSize,
stride,
padding,
dilation,
outSpatialShape,
transpose
);
if (numActOut > 0) {
// Deduplicate the flat output indices; the INT_MAX fillers merge into one
// trailing entry.
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
// GPU phase 2: decode the unique flat indices into output coordinates
// (outInds) and rewrite indicePairs row 1 to compact output row numbers.
numActOut = create_conv_indice_pair_p2_cuda(indices,
outInds,
gridOut,
indicePairs,
indiceNum,
indicePairUnique,
outSpatialShape,
transpose,
false,
useHash);
// -1 signals a GPU-side failure: retry on the CPU, then move everything
// back to the original device.
if (numActOut == -1) {
auto device = indices.device();
outInds = outInds.to({torch::kCPU});
indicePairs = indicePairs.to({torch::kCPU});
indiceNum = indiceNum.to({torch::kCPU});
indices = indices.to({torch::kCPU});
numActOut = create_conv_indice_pair_cpu(indices, outInds, gridOut, indicePairs, indiceNum, kernelSize,stride, padding, dilation, outSpatialShape, transpose, false,useHash);
return {outInds.to(device).slice(0, 0, numActOut),indicePairs.to(device), indiceNum.to(device)};
}
}
}
普通 3D 稀疏卷积依次调用 create_conv_indice_pair_p1_cuda 和 create_conv_indice_pair_p2_cuda。我们先看 create_conv_indice_pair_p1_cuda 函数,位于 src/spconv/indice.cu。
// Phase 1 of regular sparse-conv rulebook construction (GPU).
// For every active input voxel it computes the valid (kernel offset, output
// position) pairs and writes:
//   - indicePairs(0, offset, k): the input row index,
//   - indicePairs(1, offset, k): the flattened output spatial index,
//   - indiceNum[offset]:         per-kernel-position pair counts (atomics),
//   - indicePairUnique:          flattened output indices, deduplicated later.
// Returns 1 on success, 0 when there are no active inputs. Coordinate
// reconstruction happens afterwards in create_conv_indice_pair_p2_cuda.
int create_conv_indice_pair_p1_cuda(
    torch::Tensor indicesIn,        // [numActIn, NDim+1]; column 0 is batch idx
    torch::Tensor indicePairs,      // [2, kernelVolume, numActIn]
    torch::Tensor indiceNum,        // [kernelVolume]
    torch::Tensor indicePairUnique, // [kernelVolume * numActIn + 1]
    std::vector<int64_t> kernelSize,
    std::vector<int64_t> stride,
    std::vector<int64_t> padding,
    std::vector<int64_t> dilation,
    std::vector<int64_t> outSpatialShape,
    bool transpose) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = kernelSize.size();
  auto numActIn = indicesIn.size(0);
  auto kernelVolume = indiceNum.size(0);
  if (numActIn == 0)
    return 0;
  // Lift the runtime index dtype, dimensionality, and kernel volume to
  // compile-time template parameters before launching the kernel.
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
    using Index = TV_DECLTYPE(IndexValue);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = TV_DECLTYPE(I)::value;
      tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
      tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
      tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
      tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
                                       outSpatialShape.end());
      // Select the smallest compile-time MaxKernelVolume >= kernelVolume.
      tv::DispatchInt<max_kernel_vol_t>()(
          kernelVolume, std::less_equal<int>(), [&](auto I2) {
            constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
            if (transpose) {
              prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
                                  tv::torch2tv<Index>(indicePairs),
                                  tv::torch2tv<Index>(indiceNum),
                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
                                  pa, di, ou);
              TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
            } else {
              prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
                                  tv::torch2tv<Index>(indicePairs),
                                  tv::torch2tv<Index>(indiceNum),
                                  tv::torch2tv<Index>(indicePairUnique), ks, st,
                                  pa, di, ou);
              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
            }
#ifdef TV_LOG_KERNEL_INFO
            // BUGFIX: the original always queried the attributes of
            // prepareDeConvIndicePairsKernel (even when the non-transpose
            // kernel was launched) while labelling the output
            // "prepareIndicePairsKernel". Query whichever kernel actually ran.
            cudaFuncAttributes attr;
            if (transpose) {
              checkCudaErrors(cudaFuncGetAttributes(
                  &attr,
                  prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
              tv::ssprint("prepareDeConvIndicePairsKernel<", tv::type_s<Index>,
                          NDim, MaxKernelVolume, ">", attr.numRegs);
            } else {
              checkCudaErrors(cudaFuncGetAttributes(
                  &attr,
                  prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>));
              tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
                          MaxKernelVolume, ">", attr.numRegs);
            }
#endif
          });
    });
  });
  return 1;
}
重点看 prepareIndicePairsKernel 核函数。
// One thread per active input voxel: for input `ix`, enumerate every kernel
// offset that yields an in-bounds output and record the (input, output) pair
// in the rulebook.
//   indicesIn:        [numActIn, NDim+1]; column 0 is the batch index.
//   indicePairs:      [2, kernelVolume, numActIn]; row 0 <- input index,
//                     row 1 <- flattened output spatial index.
//   indiceNum:        [kernelVolume] pair counters, bumped with atomicAdd.
//   indicePairUnique: flattened output indices, deduplicated later on host.
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,typename Index1D = int>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn,
    tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum,
    tv::TensorView<Index1D> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape
) {
  auto numActIn = indicesIn.dim(0);
  // Number of cells in one output feature map (product of spatial dims);
  // used to offset the flattened index by the batch component.
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  // Per-thread scratch: up to KernelMaxVolume candidate outputs, each stored
  // as NDim coordinates plus the linear kernel-weight offset.
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    // All in-bounds output positions reachable from this input, plus the
    // kernel weight offset each one uses. "+ 1" skips the batch column.
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1,
        kernelSize.data(),
        stride.data(),
        padding.data(),
        dilation.data(),
        outSpatialShape.data(),
        validPoints
    );
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim]; // linear kernel-weight offset
      // Reserve a slot for this kernel position; atomic because many threads
      // append to the same kernel offset concurrently.
      Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(0, offset, oldNum) = ix;
      // Flatten (batch, spatial...) into a single output index.
      index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
                  pointPtr, outSpatialShape.data(), 0) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(1, offset, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}
getValidOutPos 的作用是:根据输入点计算输出哈希表以及输出所用到的卷积核权重的位置,同时返回有效输出的个数。直接看下列代码,注释比较详细。
// For one input position, enumerate every output position it can contribute
// to under (kernelSize, stride, padding, dilation), clipped to
// outSpatialShape. Each candidate is written to `out` as NDim coordinates
// followed by the linear index of the kernel weight connecting input and
// output; returns the number of valid (in-bounds) points.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
                                    const Index *kernelSize,
                                    const Index *stride,
                                    const Index *padding,
                                    const Index *dilation,
                                    const Index *outSpatialShape,
                                    Index *out
) {
  Index lowers[NDim];      // smallest candidate output coordinate per dim
  Index uppers[NDim];      // largest candidate output coordinate per dim
  Index counter[NDim];     // odometer over the candidate grid
  Index counterSize[NDim]; // number of candidates per dim
  Index pointCounter = 0;
  Index val;
  Index numPoints = 1;
  Index m, offset;
  bool valid = false;
// Output-coordinate bounds, derived (see article text) from
//   lower: s*(n-1) + (k-1)*d + 1 = x + p
//   upper: s*n = x + p
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + stride[i] + padding[i]) / stride[i];
    uppers[i] = (input_pos[i] + padding[i]) / stride[i];
  }
// Candidate outputs form a grid stepping by `dilation` along each dimension.
#pragma unroll
  for (unsigned i = 0; i < NDim; ++i) {
    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
    numPoints *= counterSize[i];
  }
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    counter[i] = 0;
  }
  for (int i = 0; i < numPoints; ++i) {
    valid = true;
    m = 1;
    offset = 0;
// Walk dims innermost-to-outermost, computing the output coordinate and,
// simultaneously, the row-major linear kernel-weight offset.
#pragma unroll
    for (int j = NDim - 1; j >= 0; --j) {
      val = uppers[j] - counter[j] * dilation[j];
      out[pointCounter * (NDim + 1) + j] = val;
      if (val < 0 || (val > outSpatialShape[j] - 1)) {
        valid = false; // outside the output feature map: discard this point
      }
      // (x - n*s + p) / d is the kernel tap (0..k-1) used along dim j.
      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
      m *= kernelSize[j];
    }
    out[pointCounter * (NDim + 1) + NDim] = offset;
    if (valid)
      ++pointCounter; // keep the slot only if every dim was in bounds
    // Advance the multi-dimensional odometer (innermost dim fastest).
    counter[NDim - 1] += 1;
#pragma unroll
    for (int c = NDim - 1; c >= 0; --c) {
      if (counter[c] == counterSize[c] && c > 0) {
        counter[c - 1] += 1;
        counter[c] = 0;
      }
    }
  }
  return pointCounter;
}
关于输出上下限如何得出,计算过程如下。以 1 维卷积为例:给定输入点,对应的输出点取决于内核大小 $k$、步长 $s$、膨胀率 $d$ 和填充 $p$。对于输入位置 $x$,它到特征图边界的距离为 $x+p$。
假设输出点的最小值为 $n$,有以下关系:$s(n-1)+k'=x+p$
其中 $k'$ 是有效内核大小,它取决于内核大小和膨胀率:$k'=(k-1)(d-1)+k$
代入 $k'$,等式变为:$s(n-1)+(k-1)(d-1)+k=x+p$
重新排列,计算 lowers 为:$n=\dfrac{x-d(k-1)-1+s+p}{s}$
同理,假设输出点的最大值为 $n$,则有如下关系:$s\cdot n=x+p$
则计算 uppers 为:$n=\dfrac{x+p}{s}$
参考:https://github.com/traveller59/spconv/issues/224
对于 counter 变量的含义可以参考注释代码,如有哪些地方理解有误,也麻烦大家指出来。
create_conv_indice_pair_p2_cuda
位于:src/spconv/indice.cu
// Phase 2 of regular sparse-conv rulebook construction (GPU): decode the
// deduplicated flat output indices into output coordinates and rewrite
// indicePairs row 1 from flat spatial indices to compact output row numbers.
// Returns the number of active outputs, or -1 on failure (the caller then
// falls back to the CPU implementation).
int create_conv_indice_pair_p2_cuda(
    torch::Tensor indicesIn,        // torch.Size([N, 4]) input indices
    torch::Tensor indicesOut,       // torch.Size([N*27, 4]) written here
    torch::Tensor gridsOut,         // [4, 21*720*720] dense flat-index -> row lookup
    torch::Tensor indicePairs,      // torch.Size([2, 27, N])
    torch::Tensor indiceNum,        // torch.Size([27]); pair count per kernel position
    torch::Tensor indicePairUnique, // N*27+1 deduplicated flat output indices
    std::vector<int64_t> outSpatialShape, // [21, 720, 720]
    bool transpose,                 // false in this trace
    bool resetGrid,                 // false in this trace
    bool useHash                    // false in this trace
) {
  auto stream = at::cuda::getCurrentCUDAStream();
  auto ndim = outSpatialShape.size(); // 3
  auto numActIn = indicesIn.size(0);  // N
  int batchSize = gridsOut.size(0);   // 4
  // The last unique entry is the INT_MAX filler, hence the -1.
  int numAct = indicePairUnique.size(0) - 1;
  auto kernelVolume = indiceNum.size(0);
  if (numActIn == 0)
    return 0;
  bool failed = false;
  tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
    using Index = TV_DECLTYPE(IndexValue);
    using IndexGrid = int32_t;
    tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
      constexpr int NDim = TV_DECLTYPE(I)::value;
      using IndexGrid = int32_t;
      tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),outSpatialShape.end());
      if (useHash) { // not taken in this trace
        ...... // omitted in the article
      } else { // taken in this trace (useHash == false)
        // Scatter: gridsOut[flatIdx] = compact output row; also decode each
        // flat index into (batch, z, y, x) coordinates in indicesOut.
        assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,stream>>>(
                tv::torch2tv<Index>(indicesOut),       // torch.Size([N*27, 4])
                tv::torch2tv<IndexGrid>(gridsOut),     // [4, 21*720*720]
                numAct,                                // number of unique outputs
                tv::torch2tv<Index>(indicePairs),      // torch.Size([2, 27, N])
                tv::torch2tv<Index>(indicePairUnique), // unique flat output indices
                ou,                                    // output spatial shape
                batchSize                              // 4
            );
        TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
        // Gather: replace each flat output index in indicePairs row 1 with
        // the compact row number looked up from gridsOut.
        assignIndicePairsKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(tv::torch2tv<Index>(indicesOut),
                         tv::torch2tv<IndexGrid>(gridsOut), numActIn,
                         tv::torch2tv<Index>(indicePairs),
                         tv::torch2tv<Index>(indicePairUnique), ou);
        TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
#ifdef TV_LOG_KERNEL_INFO
        ...... // logging omitted in the article
#endif
      }
      if (resetGrid && (!useHash)) { // not taken in this trace
        resetGridKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,stream>>>(indicePairUnique.data_ptr<Index>(),tv::torch2tv<IndexGrid>(gridsOut), numAct);
        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
      }
    });
  });
  if (failed){
    return -1;
  }
  return numAct;
}
assignGridAndIndiceOutKernel
位于:include/spconv/indice.cu.h
// One thread per unique output: record the output row number in the dense
// grid (flat index -> row) and decode the flat index back into coordinates.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut,       // [N*27, 4] output coordinates (written here)
    tv::TensorView<IndexGrid> gridsOut,     // [4, 21*720*720] flat-index -> row lookup (written here)
    int numAct,                             // number of unique output indices
    tv::TensorView<Index> indicePairs,      // [2, 27, N] (not read in this kernel)
    tv::TensorView<Index> indicePairUnique, // unique flat output indices
    const tv::SimpleVector<Index, NDim> outSpatialShape, // output spatial shape
    int batchSize                           // 4
) {
  Index index;
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    index = indicePairUnique[ix];
    // Dense lookup table: flat output index -> compact row number.
    gridsOut[index] = ix;
    // Decode the spatial coordinates into indicesOut row `ix` (cols 1..NDim);
    // rowArrayIdxInv returns what remains after all divisions, i.e. the
    // batch component.
    index = tv::rowArrayIdxInv<Index, NDim>(index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
    // Column 0 stores the batch index (% batchSize keeps it in range).
    indicesOut[ix * (NDim + 1)] = index % batchSize;
  }
}
rowArrayIdxInv
位于:include/tensorview/tensorview.h
// Inverse of row-major flattening: decompose `index` into NDim coordinates
// (written to `output`, innermost dimension handled first) and return whatever
// remains after all NDim divisions — the leading (e.g. batch) component.
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
                                           const Index *shape) {
  Index remaining = index;
#pragma unroll
  for (int dim = NDim - 1; dim >= 0; --dim) {
    const Index coord = remaining % shape[dim];
    output[dim] = coord;
    remaining = (remaining - coord) / shape[dim];
  }
  return remaining;
}
继续看 assignIndicePairsKernel:
// One thread per active input: rewrite the output row of the rulebook from
// flattened spatial indices to compact output row numbers via gridsOut.
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
                        tv::TensorView<IndexGrid> gridsOut, int numActIn,
                        tv::TensorView<Index> indicePairs,
                        tv::TensorView<Index> indicePairUnique,
                        const tv::SimpleVector<Index, NDim> outSpatialShape) {
  Index index;
  int kernelVolume = indicePairs.dim(1);
  // View of indicePairs[1]: the output side of the rulebook (the map from
  // output tensor positions to output row numbers).
  auto indicePairsOut = indicePairs.subview(1);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int i = 0; i < kernelVolume; ++i) {
      index = indicePairsOut(i, ix);
      // Negative entries are skipped — presumably slots left unfilled by
      // phase 1; confirm the initialization value upstream.
      if (index > -1) {
        // Replace the flat spatial index with the compact output row number.
        indicePairsOut(i, ix) = gridsOut[index];
      }
    }
  }
}
subview 位于 include/tensorview/tensorview.h,其作用是获取张量的子视图(sub-view):
// Returns a non-owning view of this tensor with the first ids.size()
// dimensions fixed at the given indices (i.e. tensor[ids[0], ids[1], ...]).
// NOTE(review): the declared return type uses rank -1 while the body
// constructs TensorView<T, Rank, ...> — presumably an implicit conversion;
// confirm against the full TensorView definition.
TV_HOST_DEVICE_INLINE TensorView<T, -1, PtrTraits, Tindex>
subview(SimpleVector<int> ids) const {
  // Pad the index prefix with zeros for the remaining dims so rowArrayIdx
  // yields the flat offset of the sub-block's first element.
  Shape start = ids;
  for (int i = ids.size(); i < ndim(); ++i) {
    start.push_back(0);
  }
  return TensorView<T, Rank, PtrTraits, Tindex>(
      ptr_ + rowArrayIdx(shape_, start), shape_.subshape(ids.size()));
}
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)