上一篇文章中提到 warpAffine 会分块处理,将坐标映射和插值系数分别存储下来,然后借助 remap 来实现最终的映射。而 remap 会根据映射关系取源像素并加权计算出目的像素值。其最核心的计算为 RemapVec_8u 。
cv::remap
RemapInvoker
nnfunc
ifunc
$$\mathrm{dst}(x,y)=\mathrm{src}(\mathrm{map}_x(x,y), \mathrm{map}_y(x,y))$$
定义不同类型的函数表。 remapBilinear
CV_INSTRUMENT_REGION ( ) ;
// Nearest-neighbour kernels. The table is indexed by the source depth
// (nn_tab[ depth] ); the slots are, in order: uchar, schar, ushort, short,
// int, float, double. The trailing 0 is an empty slot; the selected entry is
// checked against 0 with CV_Assert before it is used.
static RemapNNFunc nn_tab[ ] =
{
remapNearest< uchar> , remapNearest< schar> , remapNearest< ushort> , remapNearest< short > ,
remapNearest< int > , remapNearest< float > , remapNearest< double > , 0
} ;
// Bilinear kernels, laid out in the same depth order as nn_tab. A 0 entry
// (the schar, int and last slots) means no kernel exists for that depth.
// Only the uchar entry works in fixed point (FixedPtCast with short weights)
// and plugs in the RemapVec_8u vector helper; every other entry uses float
// weights with RemapNoVec.
static RemapFunc linear_tab[ ] =
{
remapBilinear< FixedPtCast< int , uchar, INTER_REMAP_COEF_BITS> , RemapVec_8u, short > , 0 ,
remapBilinear< Cast< float , ushort> , RemapNoVec, float > ,
remapBilinear< Cast< float , short > , RemapNoVec, float > , 0 ,
remapBilinear< Cast< float , float > , RemapNoVec, float > ,
remapBilinear< Cast< double , double > , RemapNoVec, float > , 0
} ;
// Bicubic kernels, same slot layout as linear_tab. The last template argument
// is INTER_REMAP_COEF_SCALE for the fixed-point uchar entry and 1 for the
// float-weight entries.
static RemapFunc cubic_tab[ ] =
{
remapBicubic< FixedPtCast< int , uchar, INTER_REMAP_COEF_BITS> , short , INTER_REMAP_COEF_SCALE> , 0 ,
remapBicubic< Cast< float , ushort> , float , 1 > ,
remapBicubic< Cast< float , short > , float , 1 > , 0 ,
remapBicubic< Cast< float , float > , float , 1 > ,
remapBicubic< Cast< double , double > , float , 1 > , 0
} ;
// Lanczos-4 kernels, same slot layout and template-argument pattern as cubic_tab.
static RemapFunc lanczos4_tab[ ] =
{
remapLanczos4< FixedPtCast< int , uchar, INTER_REMAP_COEF_BITS> , short , INTER_REMAP_COEF_SCALE> , 0 ,
remapLanczos4< Cast< float , ushort> , float , 1 > ,
remapLanczos4< Cast< float , short > , float , 1 > , 0 ,
remapLanczos4< Cast< float , float > , float , 1 > ,
remapLanczos4< Cast< double , double > , float , 1 > , 0
} ;
_map1
不能为空,_map2
可以。
CV_Assert ( ! _map1. empty ( ) ) ;
CV_Assert ( _map2. empty ( ) || ( _map2. size ( ) == _map1. size ( ) ) ) ;
CV_OCL_RUN ( _src. dims ( ) <= 2 && _dst. isUMat ( ) ,
ocl_remap ( _src, _dst, _map1, _map2, interpolation, borderType, borderValue) )
Mat src = _src. getMat ( ) , map1 = _map1. getMat ( ) , map2 = _map2. getMat ( ) ;
_dst. create ( map1. size ( ) , src. type ( ) ) ;
Mat dst = _dst. getMat ( ) ;
CV_OVX_RUN (
src. type ( ) == CV_8UC1 && dst. type ( ) == CV_8UC1 &&
! ovx:: skipSmallImages < VX_KERNEL_REMAP> ( src. cols, src. rows) &&
( borderType& ~ BORDER_ISOLATED) == BORDER_CONSTANT &&
( ( map1. type ( ) == CV_32FC2 && map2. empty ( ) && map1. size == dst. size) ||
( map1. type ( ) == CV_32FC1 && map2. type ( ) == CV_32FC1 && map1. size == dst. size && map2. size == dst. size) ||
( map1. empty ( ) && map2. type ( ) == CV_32FC2 && map2. size == dst. size) ) &&
( ( borderType & BORDER_ISOLATED) != 0 || ! src. isSubmatrix ( ) ) ,
openvx_remap ( src, dst, map1, map2, interpolation, borderValue) ) ;
输入输出不能过大。 如果输入输出相同则进行拷贝。
CV_Assert ( dst. cols < SHRT_MAX && dst. rows < SHRT_MAX && src. cols < SHRT_MAX && src. rows < SHRT_MAX ) ;
if ( dst. data == src. data )
src = src. clone ( ) ;
if ( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
int type = src. type ( ) , depth = CV_MAT_DEPTH ( type) ;
# if defined HAVE_IPP && ! IPP_DISABLE_REMAP
CV_IPP_CHECK ( )
{
if ( ( interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
map1. type ( ) == CV_32FC1 && map2. type ( ) == CV_32FC1 &&
( borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT) )
{
int ippInterpolation =
interpolation == INTER_NEAREST ? IPPI_INTER_NN :
interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
ippiRemap ippFunc =
type == CV_8UC1 ? ( ippiRemap) ippiRemap_8u_C1R :
type == CV_8UC3 ? ( ippiRemap) ippiRemap_8u_C3R :
type == CV_8UC4 ? ( ippiRemap) ippiRemap_8u_C4R :
type == CV_16UC1 ? ( ippiRemap) ippiRemap_16u_C1R :
type == CV_16UC3 ? ( ippiRemap) ippiRemap_16u_C3R :
type == CV_16UC4 ? ( ippiRemap) ippiRemap_16u_C4R :
type == CV_32FC1 ? ( ippiRemap) ippiRemap_32f_C1R :
type == CV_32FC3 ? ( ippiRemap) ippiRemap_32f_C3R :
type == CV_32FC4 ? ( ippiRemap) ippiRemap_32f_C4R : 0 ;
if ( ippFunc)
{
bool ok;
IPPRemapInvoker invoker ( src, dst, map1, map2, ippFunc, ippInterpolation,
borderType, borderValue, & ok) ;
Range range ( 0 , dst. rows) ;
parallel_for_ ( range, invoker, dst. total ( ) / ( double ) ( 1 << 16 ) ) ;
if ( ok)
{
CV_IMPL_ADD ( CV_IMPL_IPP| CV_IMPL_MT) ;
return ;
}
setIppErrorStatus ( ) ;
}
}
}
# endif
如果是最近邻插值设置nnfunc
,否则设置ifunc
。 initInterTab2D 返回静态数组的指针给ctab
。
RemapNNFunc nnfunc = 0 ;
RemapFunc ifunc = 0 ;
const void * ctab = 0 ;
bool fixpt = depth == CV_8U;
bool planar_input = false ;
if ( interpolation == INTER_NEAREST )
{
nnfunc = nn_tab[ depth] ;
CV_Assert ( nnfunc != 0 ) ;
}
else
{
if ( interpolation == INTER_LINEAR )
ifunc = linear_tab[ depth] ;
else if ( interpolation == INTER_CUBIC ) {
ifunc = cubic_tab[ depth] ;
CV_Assert ( _src. channels ( ) <= 4 ) ;
}
else if ( interpolation == INTER_LANCZOS4 ) {
ifunc = lanczos4_tab[ depth] ;
CV_Assert ( _src. channels ( ) <= 4 ) ;
}
else
CV_Error ( CV_StsBadArg, "Unknown interpolation method" ) ;
CV_Assert ( ifunc != 0 ) ;
ctab = initInterTab2D ( interpolation, fixpt ) ;
}
const Mat * m1 = & map1, * m2 = & map2;
if ( ( map1. type ( ) == CV_16SC2 && ( map2. type ( ) == CV_16UC1 || map2. type ( ) == CV_16SC1 || map2. empty ( ) ) ) ||
( map2. type ( ) == CV_16SC2 && ( map1. type ( ) == CV_16UC1 || map1. type ( ) == CV_16SC1 || map1. empty ( ) ) ) )
{
if ( map1. type ( ) != CV_16SC2 )
std:: swap ( m1, m2) ;
}
else
{
CV_Assert ( ( ( map1. type ( ) == CV_32FC2 || map1. type ( ) == CV_16SC2) && map2. empty ( ) ) ||
( map1. type ( ) == CV_32FC1 && map2. type ( ) == CV_32FC1) ) ;
planar_input = map1. channels ( ) == 1 ;
}
调用 RemapInvoker 的函数。
RemapInvoker invoker ( src, dst, m1, m2,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab) ;
parallel_for_ ( Range ( 0 , dst. rows) , invoker, dst. total ( ) / ( double ) ( 1 << 16 ) ) ;
initInterTab2D
initInterTab1D
initInterTab2D 存储_tab
中两个核参数相乘的结果。
// One "already built" flag per interpolation method, so each 2D table is
// filled only on first request.
// NOTE(review): this is a plain static bool with no synchronization visible
// in this excerpt - confirm how concurrent first calls are handled.
static bool inittab[ INTER_MAX+ 1 ] = { false } ;
// tab / itab point at the float and the short (fixed-point) version of the
// table for the requested method; ksize is the kernel width per axis.
float * tab = 0 ;
short * itab = 0 ;
int ksize = 0 ;
// Bilinear uses a 2x2 kernel, bicubic 4x4, Lanczos-4 8x8. Any other method is
// rejected with CV_Error.
if ( method == INTER_LINEAR )
tab = BilinearTab_f[ 0 ] [ 0 ] , itab = BilinearTab_i[ 0 ] [ 0 ] , ksize= 2 ;
else if ( method == INTER_CUBIC )
tab = BicubicTab_f[ 0 ] [ 0 ] , itab = BicubicTab_i[ 0 ] [ 0 ] , ksize= 4 ;
else if ( method == INTER_LANCZOS4 )
tab = Lanczos4Tab_f[ 0 ] [ 0 ] , itab = Lanczos4Tab_i[ 0 ] [ 0 ] , ksize= 8 ;
else
CV_Error ( CV_StsBadArg, "Unknown/unsupported interpolation type" ) ;
initInterTab1D 计算不同方法的一维表值,即原始系数。 INTER_TAB_SIZE 为 $2^{\mathrm{INTER\_BITS}}$,表的大小取决于插值位宽。 tab
共有 INTER_TAB_SIZE2 组(每组 ksize×ksize 个值),从而能存储不同系数之间的乘积。 NNDeltaTab_i
的作用是什么? 内层循环计算中心系数乘积。vy
为当前行的中心系数值。 INTER_REMAP_COEF_SCALE 为 $2^{\mathrm{INTER\_REMAP\_COEF\_BITS}}$。 isum
为itab
当前位置上全部 ksize×ksize 个定点系数之和。
if ( ! inittab[ method] )
{
AutoBuffer< float > _tab ( 8 * INTER_TAB_SIZE) ;
int i, j, k1, k2;
initInterTab1D ( method, _tab. data ( ) , INTER_TAB_SIZE) ;
for ( i = 0 ; i < INTER_TAB_SIZE; i++ )
for ( j = 0 ; j < INTER_TAB_SIZE; j++ , tab += ksize* ksize, itab += ksize* ksize )
{
int isum = 0 ;
NNDeltaTab_i[ i* INTER_TAB_SIZE+ j] [ 0 ] = j < INTER_TAB_SIZE/ 2 ;
NNDeltaTab_i[ i* INTER_TAB_SIZE+ j] [ 1 ] = i < INTER_TAB_SIZE/ 2 ;
for ( k1 = 0 ; k1 < ksize; k1++ )
{
float vy = _tab[ i* ksize + k1] ;
for ( k2 = 0 ; k2 < ksize; k2++ )
{
float v = vy* _tab[ j* ksize + k2] ;
tab[ k1* ksize + k2] = v;
isum += itab[ k1* ksize + k2] = saturate_cast < short > ( v* INTER_REMAP_COEF_SCALE) ;
}
}
计算完成后,tab
和itab
重新指向首地址。
if ( isum != INTER_REMAP_COEF_SCALE )
{
int diff = isum - INTER_REMAP_COEF_SCALE;
int ksize2 = ksize/ 2 , Mk1= ksize2, Mk2= ksize2, mk1= ksize2, mk2= ksize2;
for ( k1 = ksize2; k1 < ksize2+ 2 ; k1++ )
for ( k2 = ksize2; k2 < ksize2+ 2 ; k2++ )
{
if ( itab[ k1* ksize+ k2] < itab[ mk1* ksize+ mk2] )
mk1 = k1, mk2 = k2;
else if ( itab[ k1* ksize+ k2] > itab[ Mk1* ksize+ Mk2] )
Mk1 = k1, Mk2 = k2;
}
if ( diff < 0 )
itab[ Mk1* ksize + Mk2] = ( short ) ( itab[ Mk1* ksize + Mk2] - diff) ;
else
itab[ mk1* ksize + mk2] = ( short ) ( itab[ mk1* ksize + mk2] - diff) ;
}
}
tab -= INTER_TAB_SIZE2* ksize* ksize;
itab -= INTER_TAB_SIZE2* ksize* ksize;
# if CV_SIMD128
if ( method == INTER_LINEAR )
{
for ( i = 0 ; i < INTER_TAB_SIZE2; i++ )
for ( j = 0 ; j < 4 ; j++ )
{
BilinearTab_iC4[ i] [ 0 ] [ j* 2 ] = BilinearTab_i[ i] [ 0 ] [ 0 ] ;
BilinearTab_iC4[ i] [ 0 ] [ j* 2 + 1 ] = BilinearTab_i[ i] [ 0 ] [ 1 ] ;
BilinearTab_iC4[ i] [ 1 ] [ j* 2 ] = BilinearTab_i[ i] [ 1 ] [ 0 ] ;
BilinearTab_iC4[ i] [ 1 ] [ j* 2 + 1 ] = BilinearTab_i[ i] [ 1 ] [ 1 ] ;
}
}
# endif
inittab[ method] = true ;
}
return fixpt ? ( const void * ) itab : ( const void * ) tab;
initInterTab1D
interpolateLinear
// Fills the 1D weight table: entry i receives the kernel weights for the
// fractional offset i / tabsz. tab is advanced by the number of weights per
// entry (2, 4 or 8 depending on the method) after each entry.
float scale = 1.f / tabsz;
if ( method == INTER_LINEAR )
{
// 2 weights per entry.
for ( int i = 0 ; i < tabsz; i++ , tab += 2 )
interpolateLinear ( i* scale, tab ) ;
}
else if ( method == INTER_CUBIC )
{
// 4 weights per entry.
for ( int i = 0 ; i < tabsz; i++ , tab += 4 )
interpolateCubic ( i* scale, tab ) ;
}
else if ( method == INTER_LANCZOS4 )
{
// 8 weights per entry.
for ( int i = 0 ; i < tabsz; i++ , tab += 8 )
interpolateLanczos4 ( i* scale, tab ) ;
}
else
// Any other method has no 1D table.
CV_Error ( CV_StsBadArg, "Unknown interpolation method" ) ;
计算插值系数。
// Linear weights for the fractional offset x: (1 - x) goes to coeffs[0] and
// x to coeffs[1], so the two weights always sum to 1.
coeffs[ 0 ] = 1.f - x;
coeffs[ 1 ] = x;
public :
// Captures everything operator() needs. _src and _dst are stored by address
// and _m1 / _m2 as the given pointers (no Mat is copied here), so the
// referenced matrices have to stay alive while the invoker is in use.
// _nnfunc / _ifunc are the kernels to call and _ctab is the weight table that
// is later handed to ifunc.
RemapInvoker ( const Mat& _src, Mat& _dst, const Mat * _m1,
const Mat * _m2, int _borderType, const Scalar & _borderValue,
int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void * _ctab) :
ParallelLoopBody ( ) , src ( & _src) , dst ( & _dst) , m1 ( _m1) , m2 ( _m2) ,
borderType ( _borderType) , borderValue ( _borderValue) ,
planar_input ( _planar_input) , nnfunc ( _nnfunc) , ifunc ( _ifunc) , ctab ( _ctab)
{
}
brows0
为缓冲区行数,bcols0
为列数。 dpart
为目的矩阵的当前处理区域。 _bufxy
为x
和y
的缓冲区。bufxy
为当前 RoI 区域。
virtual void operator ( ) ( const Range& range) const CV_OVERRIDE
{
int x, y, x1, y1;
const int buf_size = 1 << 14 ;
int brows0 = std:: min ( 128 , dst-> rows) , map_depth = m1-> depth ( ) ;
int bcols0 = std:: min ( buf_size/ brows0, dst-> cols) ;
brows0 = std:: min ( buf_size/ bcols0, dst-> rows) ;
Mat _bufxy ( brows0, bcols0, CV_16SC2) , _bufa;
if ( ! nnfunc )
_bufa. create ( brows0, bcols0, CV_16UC1) ;
for ( y = range. start; y < range. end; y += brows0 )
{
for ( x = 0 ; x < dst-> cols; x += bcols0 )
{
int brows = std:: min ( brows0, range. end - y) ;
int bcols = std:: min ( bcols0, dst-> cols - x) ;
Mat dpart ( * dst, Rect ( x, y, bcols, brows) ) ;
Mat bufxy ( _bufxy, Rect ( 0 , 0 , bcols, brows) ) ;
如果为最近邻映射。
if ( nnfunc )
{
if ( m1-> type ( ) == CV_16SC2 && m2-> empty ( ) ) // the data is already in the right format
bufxy = ( * m1) ( Rect ( x, y, bcols, brows) ) ;
else if ( map_depth != CV_32F )
{
for ( y1 = 0 ; y1 < brows; y1++ )
{
short * XY = bufxy. ptr < short > ( y1) ;
const short * sXY = m1-> ptr < short > ( y+ y1) + x* 2 ;
const ushort* sA = m2-> ptr < ushort> ( y+ y1) + x;
for ( x1 = 0 ; x1 < bcols; x1++ )
{
int a = sA[ x1] & ( INTER_TAB_SIZE2- 1 ) ;
XY[ x1* 2 ] = sXY[ x1* 2 ] + NNDeltaTab_i[ a] [ 0 ] ;
XY[ x1* 2 + 1 ] = sXY[ x1* 2 + 1 ] + NNDeltaTab_i[ a] [ 1 ] ;
}
}
}
else if ( ! planar_input )
( * m1) ( Rect ( x, y, bcols, brows) ) . convertTo ( bufxy, bufxy. depth ( ) ) ;
else
{
for ( y1 = 0 ; y1 < brows; y1++ )
{
short * XY = bufxy. ptr < short > ( y1) ;
const float * sX = m1-> ptr < float > ( y+ y1) + x;
const float * sY = m2-> ptr < float > ( y+ y1) + x;
x1 = 0 ;
# if CV_SIMD128
{
int span = v_float32x4:: nlanes;
for ( ; x1 <= bcols - span * 2 ; x1 += span * 2 )
{
v_int32x4 ix0 = v_round ( v_load ( sX + x1) ) ;
v_int32x4 iy0 = v_round ( v_load ( sY + x1) ) ;
v_int32x4 ix1 = v_round ( v_load ( sX + x1 + span) ) ;
v_int32x4 iy1 = v_round ( v_load ( sY + x1 + span) ) ;
v_int16x8 dx, dy;
dx = v_pack ( ix0, ix1) ;
dy = v_pack ( iy0, iy1) ;
v_store_interleave ( XY + x1 * 2 , dx, dy) ;
}
}
# endif
for ( ; x1 < bcols; x1++ )
{
XY[ x1* 2 ] = saturate_cast < short > ( sX[ x1] ) ;
XY[ x1* 2 + 1 ] = saturate_cast < short > ( sY[ x1] ) ;
}
}
}
nnfunc ( * src, dpart, bufxy, borderType, borderValue ) ;
continue ;
}
否则为线性插值。 XY
指向bufxy
的当前行,A
指向bufa
的当前行。 如果m1
为双通道整型且m2
为单通道整型, bufxy
封装m1
的当前处理分块,即源坐标值。 sA
为m2
矩阵的当前位置。 INTER_TAB_SIZE2 为 $2^{10}$,由掩码得到小数部分保存到A
中。
Mat bufa ( _bufa, Rect ( 0 , 0 , bcols, brows) ) ;
for ( y1 = 0 ; y1 < brows; y1++ )
{
short * XY = bufxy. ptr < short > ( y1) ;
ushort* A = bufa. ptr < ushort> ( y1) ;
if ( m1-> type ( ) == CV_16SC2 && ( m2-> type ( ) == CV_16UC1 || m2-> type ( ) == CV_16SC1) )
{
bufxy = ( * m1) ( Rect ( x, y, bcols, brows) ) ;
const ushort* sA = m2-> ptr < ushort> ( y+ y1) + x;
x1 = 0 ;
# if CV_SIMD128
{
v_uint16x8 v_scale = v_setall_u16 ( INTER_TAB_SIZE2 - 1 ) ;
int span = v_uint16x8:: nlanes;
for ( ; x1 <= bcols - span; x1 += span )
v_store ( ( unsigned short * ) ( A + x1) , v_load ( sA + x1) & v_scale) ;
}
# endif
for ( ; x1 < bcols; x1++ )
A[ x1] = ( ushort) ( sA[ x1] & ( INTER_TAB_SIZE2- 1 ) ) ;
}
如果m1
为单通道
else if ( planar_input )
{
const float * sX = m1-> ptr < float > ( y+ y1) + x;
const float * sY = m2-> ptr < float > ( y+ y1) + x;
x1 = 0 ;
# if CV_SIMD128
{
v_float32x4 v_scale = v_setall_f32 ( ( float ) INTER_TAB_SIZE) ;
v_int32x4 v_scale2 = v_setall_s32 ( INTER_TAB_SIZE - 1 ) ;
int span = v_float32x4:: nlanes;
for ( ; x1 <= bcols - span * 2 ; x1 += span * 2 )
{
v_int32x4 v_sx0 = v_round ( v_scale * v_load ( sX + x1) ) ;
v_int32x4 v_sy0 = v_round ( v_scale * v_load ( sY + x1) ) ;
v_int32x4 v_sx1 = v_round ( v_scale * v_load ( sX + x1 + span) ) ;
v_int32x4 v_sy1 = v_round ( v_scale * v_load ( sY + x1 + span) ) ;
v_uint16x8 v_sx8 = v_reinterpret_as_u16 ( v_pack ( v_sx0 & v_scale2, v_sx1 & v_scale2) ) ;
v_uint16x8 v_sy8 = v_reinterpret_as_u16 ( v_pack ( v_sy0 & v_scale2, v_sy1 & v_scale2) ) ;
v_uint16x8 v_v = v_shl < INTER_BITS> ( v_sy8) | ( v_sx8) ;
v_store ( A + x1, v_v) ;
v_int16x8 v_d0 = v_pack ( v_shr < INTER_BITS> ( v_sx0) , v_shr < INTER_BITS> ( v_sx1) ) ;
v_int16x8 v_d1 = v_pack ( v_shr < INTER_BITS> ( v_sy0) , v_shr < INTER_BITS> ( v_sy1) ) ;
v_store_interleave ( XY + ( x1 << 1 ) , v_d0, v_d1) ;
}
}
# endif
for ( ; x1 < bcols; x1++ )
{
int sx = cvRound ( sX[ x1] * INTER_TAB_SIZE) ;
int sy = cvRound ( sY[ x1] * INTER_TAB_SIZE) ;
int v = ( sy & ( INTER_TAB_SIZE- 1 ) ) * INTER_TAB_SIZE + ( sx & ( INTER_TAB_SIZE- 1 ) ) ;
XY[ x1* 2 ] = saturate_cast < short > ( sx >> INTER_BITS) ;
XY[ x1* 2 + 1 ] = saturate_cast < short > ( sy >> INTER_BITS) ;
A[ x1] = ( ushort) v;
}
}
else
{
const float * sXY = m1-> ptr < float > ( y+ y1) + x* 2 ;
x1 = 0 ;
# if CV_SIMD128
{
v_float32x4 v_scale = v_setall_f32 ( ( float ) INTER_TAB_SIZE) ;
v_int32x4 v_scale2 = v_setall_s32 ( INTER_TAB_SIZE - 1 ) , v_scale3 = v_setall_s32 ( INTER_TAB_SIZE) ;
int span = v_float32x4:: nlanes;
for ( ; x1 <= bcols - span * 2 ; x1 += span * 2 )
{
v_float32x4 v_fx, v_fy;
v_load_deinterleave ( sXY + ( x1 << 1 ) , v_fx, v_fy) ;
v_int32x4 v_sx0 = v_round ( v_fx * v_scale) ;
v_int32x4 v_sy0 = v_round ( v_fy * v_scale) ;
v_load_deinterleave ( sXY + ( ( x1 + span) << 1 ) , v_fx, v_fy) ;
v_int32x4 v_sx1 = v_round ( v_fx * v_scale) ;
v_int32x4 v_sy1 = v_round ( v_fy * v_scale) ;
v_int32x4 v_v0 = v_muladd ( v_scale3, ( v_sy0 & v_scale2) , ( v_sx0 & v_scale2) ) ;
v_int32x4 v_v1 = v_muladd ( v_scale3, ( v_sy1 & v_scale2) , ( v_sx1 & v_scale2) ) ;
v_uint16x8 v_v8 = v_reinterpret_as_u16 ( v_pack ( v_v0, v_v1) ) ;
v_store ( A + x1, v_v8) ;
v_int16x8 v_dx = v_pack ( v_shr < INTER_BITS> ( v_sx0) , v_shr < INTER_BITS> ( v_sx1) ) ;
v_int16x8 v_dy = v_pack ( v_shr < INTER_BITS> ( v_sy0) , v_shr < INTER_BITS> ( v_sy1) ) ;
v_store_interleave ( XY + ( x1 << 1 ) , v_dx, v_dy) ;
}
}
# endif
for ( ; x1 < bcols; x1++ )
{
int sx = cvRound ( sXY[ x1* 2 ] * INTER_TAB_SIZE) ;
int sy = cvRound ( sXY[ x1* 2 + 1 ] * INTER_TAB_SIZE) ;
int v = ( sy & ( INTER_TAB_SIZE- 1 ) ) * INTER_TAB_SIZE + ( sx & ( INTER_TAB_SIZE- 1 ) ) ;
XY[ x1* 2 ] = saturate_cast < short > ( sx >> INTER_BITS) ;
XY[ x1* 2 + 1 ] = saturate_cast < short > ( sy >> INTER_BITS) ;
A[ x1] = ( ushort) v;
}
}
}
调用ifunc
完成插值操作。
ifunc ( * src, dpart, bufxy, bufa, ctab, borderType, borderValue) ;
}
}
}
类成员。
private :
// Source and destination images, held as pointers to the caller's matrices.
const Mat* src;
Mat* dst;
// Coordinate maps as passed to the constructor.
const Mat * m1, * m2;
// Border mode and border fill value, forwarded unchanged to nnfunc / ifunc.
int borderType;
Scalar borderValue;
// Non-zero when the float maps are read as two separate planes
// (x from m1, y from m2) in operator().
int planar_input;
// Kernel used when non-null (nearest-neighbour path in operator()).
RemapNNFunc nnfunc;
// Kernel used on the interpolating path, together with ctab.
RemapFunc ifunc;
// Interpolation weight table passed through to ifunc.
const void * ctab;
remapBilinear
RemapVec_8u
remapBilinear 在必要时填充边界,图像映射通过 RemapVec_8u 实现加速。 sstep
为源图行跨度。MatStep 能够进行隐式类型转换。 cval
为填充像素值。 width1
和height1
为横向和纵向上的最大取值。
typedef typename CastOp :: rtype T;
typedef typename CastOp :: type1 WT;
Size ssize = _src. size ( ) , dsize = _dst. size ( ) ;
const int cn = _src. channels ( ) ;
const AT* wtab = ( const AT* ) _wtab;
const T* S0 = _src. ptr < T> ( ) ;
size_t sstep = _src. step/ sizeof ( S0[ 0 ] ) ;
T cval[ CV_CN_MAX] ;
CastOp castOp;
VecOp vecOp;
for ( int k = 0 ; k < cn; k++ )
cval[ k] = saturate_cast < T> ( _borderValue[ k & 3 ] ) ;
unsigned width1 = std:: max ( ssize. width- 1 , 0 ) , height1 = std:: max ( ssize. height- 1 , 0 ) ;
CV_Assert ( ! ssize. empty ( ) ) ;
# if CV_SIMD128
if ( _src. type ( ) == CV_8UC3 )
width1 = std:: max ( ssize. width- 2 , 0 ) ;
# endif
D
指向目的图上的当前行。 XY
和FXY
指向整数和小数部分的值。 curInlier
判断是否超出边界。 如果dx
在区域内,根据XY
中的值是否超出边界来判断。 X0
是上一次处理完后的行位置,也是本次起点,X1
是本次处理的终点。 dx = 0
时,由于X0=0
,区间为空,所以跳过不处理。如果prevInlier
和curInlier
相等则跳过不处理。 dx = dsize.width
本是一个不存在的位置,这使得在行尾prevInlier
和curInlier
不相等,马上进行处理。
for ( int dy = 0 ; dy < dsize. height; dy++ )
{
T* D = _dst. ptr < T> ( dy) ;
const short * XY = _xy. ptr < short > ( dy) ;
const ushort* FXY = _fxy. ptr < ushort> ( dy) ;
int X0 = 0 ;
bool prevInlier = false ;
for ( int dx = 0 ; dx <= dsize. width; dx++ )
{
bool curInlier = dx < dsize. width ?
( unsigned ) XY[ dx* 2 ] < width1 &&
( unsigned ) XY[ dx* 2 + 1 ] < height1 : ! prevInlier;
if ( curInlier == prevInlier )
continue ;
int X1 = dx;
dx = X0;
X0 = X1;
prevInlier = curInlier;
$$f(x, y) = a_0\alpha_0\beta_0 + a_1\alpha_1\beta_0 + b_0\alpha_0\beta_1 + b_1\alpha_1\beta_1$$
如果 curInlier 为假(即刚结束的区间 [dx, X1) 内都是内点,四个邻点均不越界),调用vecOp
处理可向量化的数据;剩余部分循环处理。总共处理X1-X0
个像素。 S
为对应到源图的左上点地址,w
为融合后的4个权重参数。以FXY
数组的值作为索引,可以获得4个中心系数乘积。
if ( ! curInlier )
{
int len = vecOp ( _src, D, XY + dx* 2 , FXY + dx, wtab, X1 - dx ) ;
D += len* cn;
dx += len;
if ( cn == 1 )
{
for ( ; dx < X1; dx++ , D++ )
{
int sx = XY[ dx* 2 ] , sy = XY[ dx* 2 + 1 ] ;
const AT* w = wtab + FXY[ dx] * 4 ;
const T* S = S0 + sy* sstep + sx;
* D = castOp ( WT ( S[ 0 ] * w[ 0 ] + S[ 1 ] * w[ 1 ] + S[ sstep] * w[ 2 ] + S[ sstep+ 1 ] * w[ 3 ] ) ) ;
}
}
else if ( cn == 2 )
for ( ; dx < X1; dx++ , D += 2 )
{
int sx = XY[ dx* 2 ] , sy = XY[ dx* 2 + 1 ] ;
const AT* w = wtab + FXY[ dx] * 4 ;
const T* S = S0 + sy* sstep + sx* 2 ;
WT t0 = S[ 0 ] * w[ 0 ] + S[ 2 ] * w[ 1 ] + S[ sstep] * w[ 2 ] + S[ sstep+ 2 ] * w[ 3 ] ;
WT t1 = S[ 1 ] * w[ 0 ] + S[ 3 ] * w[ 1 ] + S[ sstep+ 1 ] * w[ 2 ] + S[ sstep+ 3 ] * w[ 3 ] ;
D[ 0 ] = castOp ( t0) ; D[ 1 ] = castOp ( t1) ;
}
else if ( cn == 3 )
for ( ; dx < X1; dx++ , D += 3 )
{
int sx = XY[ dx* 2 ] , sy = XY[ dx* 2 + 1 ] ;
const AT* w = wtab + FXY[ dx] * 4 ;
const T* S = S0 + sy* sstep + sx* 3 ;
WT t0 = S[ 0 ] * w[ 0 ] + S[ 3 ] * w[ 1 ] + S[ sstep] * w[ 2 ] + S[ sstep+ 3 ] * w[ 3 ] ;
WT t1 = S[ 1 ] * w[ 0 ] + S[ 4 ] * w[ 1 ] + S[ sstep+ 1 ] * w[ 2 ] + S[ sstep+ 4 ] * w[ 3 ] ;
WT t2 = S[ 2 ] * w[ 0 ] + S[ 5 ] * w[ 1 ] + S[ sstep+ 2 ] * w[ 2 ] + S[ sstep+ 5 ] * w[ 3 ] ;
D[ 0 ] = castOp ( t0) ; D[ 1 ] = castOp ( t1) ; D[ 2 ] = castOp ( t2) ;
}
else if ( cn == 4 )
for ( ; dx < X1; dx++ , D += 4 )
{
int sx = XY[ dx* 2 ] , sy = XY[ dx* 2 + 1 ] ;
const AT* w = wtab + FXY[ dx] * 4 ;
const T* S = S0 + sy* sstep + sx* 4 ;
WT t0 = S[ 0 ] * w[ 0 ] + S[ 4 ] * w[ 1 ] + S[ sstep] * w[ 2 ] + S[ sstep+ 4 ] * w[ 3 ] ;
WT t1 = S[ 1 ] * w[ 0 ] + S[ 5 ] * w[ 1 ] + S[ sstep+ 1 ] * w[ 2 ] + S[ sstep+ 5 ] * w[ 3 ] ;
D[ 0 ] = castOp ( t0) ; D[ 1 ] = castOp ( t1) ;
t0 = S[ 2 ] * w[ 0 ] + S[ 6 ] * w[ 1 ] + S[ sstep+ 2 ] * w[ 2 ] + S[ sstep+ 6 ] * w[ 3 ] ;
t1 = S[ 3 ] * w[ 0 ] + S[ 7 ] * w[ 1 ] + S[ sstep+ 3 ] * w[ 2 ] + S[ sstep+ 7 ] * w[ 3 ] ;
D[ 2 ] = castOp ( t0) ; D[ 3 ] = castOp ( t1) ;
}
else
for ( ; dx < X1; dx++ , D += cn )
{
int sx = XY[ dx* 2 ] , sy = XY[ dx* 2 + 1 ] ;
const AT* w = wtab + FXY[ dx] * 4 ;
const T* S = S0 + sy* sstep + sx* cn;
for ( int k = 0 ; k < cn; k++ )
{
WT t0 = S[ k] * w[ 0 ] + S[ k+ cn] * w[ 1 ] + S[ sstep+ k] * w[ 2 ] + S[ sstep+ k+ cn] * w[ 3 ] ;
D[ k] = castOp ( t0) ;
}
}
}
否则处理边界的情况。 BORDER_TRANSPARENT
直接跳过。
else
{
if ( borderType == BORDER_TRANSPARENT && cn != 3 )
{
D += ( X1 - dx) * cn;
dx = X1;
continue ;
}
单通道的常量边界、重复边界或者其他类型。 (sx
,sy
)为原图上的坐标。sx0
、sy0
、sx1
和sy1
是周围修剪后的坐标,用于插值。 cv::borderInterpolate 计算外推像素的源位置。
if ( cn == 1 )
for ( ; dx < X1; dx++ , D++ )
{
int sx = XY[ dx* 2 ] , sy = XY[ dx* 2 + 1 ] ;
if ( borderType == BORDER_CONSTANT &&
( sx >= ssize. width || sx+ 1 < 0 ||
sy >= ssize. height || sy+ 1 < 0 ) )
{
D[ 0 ] = cval[ 0 ] ;
}
else
{
int sx0, sx1, sy0, sy1;
T v0, v1, v2, v3;
const AT* w = wtab + FXY[ dx] * 4 ;
if ( borderType == BORDER_REPLICATE )
{
sx0 = clip ( sx, 0 , ssize. width) ;
sx1 = clip ( sx+ 1 , 0 , ssize. width) ;
sy0 = clip ( sy, 0 , ssize. height) ;
sy1 = clip ( sy+ 1 , 0 , ssize. height) ;
v0 = S0[ sy0* sstep + sx0] ;
v1 = S0[ sy0* sstep + sx1] ;
v2 = S0[ sy1* sstep + sx0] ;
v3 = S0[ sy1* sstep + sx1] ;
}
else
{
sx0 = borderInterpolate ( sx, ssize. width, borderType) ;
sx1 = borderInterpolate ( sx+ 1 , ssize. width, borderType) ;
sy0 = borderInterpolate ( sy, ssize. height, borderType) ;
sy1 = borderInterpolate ( sy+ 1 , ssize. height, borderType) ;
v0 = sx0 >= 0 && sy0 >= 0 ? S0[ sy0* sstep + sx0] : cval[ 0 ] ;
v1 = sx1 >= 0 && sy0 >= 0 ? S0[ sy0* sstep + sx1] : cval[ 0 ] ;
v2 = sx0 >= 0 && sy1 >= 0 ? S0[ sy1* sstep + sx0] : cval[ 0 ] ;
v3 = sx1 >= 0 && sy1 >= 0 ? S0[ sy1* sstep + sx1] : cval[ 0 ] ;
}
D[ 0 ] = castOp ( WT ( v0* w[ 0 ] + v1* w[ 1 ] + v2* w[ 2 ] + v3* w[ 3 ] ) ) ;
}
}
多通道的边界处理。
else
for ( ; dx < X1; dx++ , D += cn )
{
int sx = XY[ dx* 2 ] , sy = XY[ dx* 2 + 1 ] ;
if ( borderType == BORDER_CONSTANT &&
( sx >= ssize. width || sx+ 1 < 0 ||
sy >= ssize. height || sy+ 1 < 0 ) )
{
for ( int k = 0 ; k < cn; k++ )
D[ k] = cval[ k] ;
}
else
{
int sx0, sx1, sy0, sy1;
const T * v0, * v1, * v2, * v3;
const AT* w = wtab + FXY[ dx] * 4 ;
if ( borderType == BORDER_REPLICATE )
{
sx0 = clip ( sx, 0 , ssize. width) ;
sx1 = clip ( sx+ 1 , 0 , ssize. width) ;
sy0 = clip ( sy, 0 , ssize. height) ;
sy1 = clip ( sy+ 1 , 0 , ssize. height) ;
v0 = S0 + sy0* sstep + sx0* cn;
v1 = S0 + sy0* sstep + sx1* cn;
v2 = S0 + sy1* sstep + sx0* cn;
v3 = S0 + sy1* sstep + sx1* cn;
}
else if ( borderType == BORDER_TRANSPARENT &&
( ( unsigned ) sx >= ( unsigned ) ( ssize. width- 1 ) ||
( unsigned ) sy >= ( unsigned ) ( ssize. height- 1 ) ) )
continue ;
else
{
sx0 = borderInterpolate ( sx, ssize. width, borderType) ;
sx1 = borderInterpolate ( sx+ 1 , ssize. width, borderType) ;
sy0 = borderInterpolate ( sy, ssize. height, borderType) ;
sy1 = borderInterpolate ( sy+ 1 , ssize. height, borderType) ;
v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0* sstep + sx0* cn : & cval[ 0 ] ;
v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0* sstep + sx1* cn : & cval[ 0 ] ;
v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1* sstep + sx0* cn : & cval[ 0 ] ;
v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1* sstep + sx1* cn : & cval[ 0 ] ;
}
for ( int k = 0 ; k < cn; k++ )
D[ k] = castOp ( WT ( v0[ k] * w[ 0 ] + v1[ k] * w[ 1 ] + v2[ k] * w[ 2 ] + v3[ k] * w[ 3 ] ) ) ;
}
}
}
}
}
处理1、3或4通道的数据,并且跨度不能过大。 S0
为源图第0行,S1
为源图第1行。 对于单通道数据,wtab
指向输入的数组;否则指向 BilinearTab_iC4 数组。 D
为目的数据的指针。 INTER_REMAP_COEF_SCALE 为 $2^{15}$。 delta 为 $2^{14}$,用于右移前的舍入。 xy2ofs 的每个 32 位元素中,高 16 位和低 16 位分别为源图上 y 和 x 方向的偏移步长(即 sstep 和 cn)。
int operator ( ) ( const Mat& _src, void * _dst, const short * XY,
const ushort* FXY, const void * _wtab, int width ) const
{
int cn = _src. channels ( ) , x = 0 , sstep = ( int ) _src. step;
if ( ( cn != 1 && cn != 3 && cn != 4 ) || sstep >= 0x8000 )
return 0 ;
const uchar * S0 = _src. ptr ( ) , * S1 = _src. ptr ( 1 ) ;
const short * wtab = cn == 1 ? ( const short * ) _wtab : & BilinearTab_iC4[ 0 ] [ 0 ] [ 0 ] ;
uchar* D = ( uchar* ) _dst;
v_int32x4 delta = v_setall_s32 ( INTER_REMAP_COEF_SCALE / 2 ) ;
v_int16x8 xy2ofs = v_reinterpret_as_s16 ( v_setall_s32 ( cn + ( sstep << 16 ) ) ) ;
int CV_DECL_ALIGNED ( 16 ) iofs0[ 4 ] , iofs1[ 4 ] ;
const uchar* src_limit_8bytes = _src. datalimit - v_int16x8:: nlanes;
# define CV_PICK_AND_PACK_RGB ( ptr, offset, result) \
{ \
const uchar* const p = ( ( const uchar* ) ptr) + ( offset) ; \
if ( p <= src_limit_8bytes) \
{ \
v_uint8x16 rrggbb, dummy; \
v_uint16x8 rrggbb8, dummy8; \
v_uint8x16 rgb0 = v_reinterpret_as_u8 ( v_int32x4 ( * ( unaligned_int* ) ( p) , 0 , 0 , 0 ) ) ; \
v_uint8x16 rgb1 = v_reinterpret_as_u8 ( v_int32x4 ( * ( unaligned_int* ) ( p + 3 ) , 0 , 0 , 0 ) ) ; \
v_zip ( rgb0, rgb1, rrggbb, dummy) ; \
v_expand ( rrggbb, rrggbb8, dummy8) ; \
result = v_reinterpret_as_s16 ( rrggbb8) ; \
} \
else \
{ \
result = v_int16x8 ( ( short ) p[ 0 ] , ( short ) p[ 3 ] , /* r0r1 */ \
( short ) p[ 1 ] , ( short ) p[ 4 ] , /* g0g1 */ \
( short ) p[ 2 ] , ( short ) p[ 5 ] , /* b0b1 */ 0 , 0 ) ; \
} \
}
# define CV_PICK_AND_PACK_RGBA ( ptr, offset, result) \
{ \
const uchar* const p = ( ( const uchar* ) ptr) + ( offset) ; \
CV_DbgAssert ( p <= src_limit_8bytes) ; \
v_uint8x16 rrggbbaa, dummy; \
v_uint16x8 rrggbbaa8, dummy8; \
v_uint8x16 rgba0 = v_reinterpret_as_u8 ( v_int32x4 ( * ( unaligned_int* ) ( p) , 0 , 0 , 0 ) ) ; \
v_uint8x16 rgba1 = v_reinterpret_as_u8 ( v_int32x4 ( * ( unaligned_int* ) ( p + v_int32x4:: nlanes) , 0 , 0 , 0 ) ) ; \
v_zip ( rgba0, rgba1, rrggbbaa, dummy) ; \
v_expand ( rrggbbaa, rrggbbaa8, dummy8) ; \
result = v_reinterpret_as_s16 ( rrggbbaa8) ; \
}
# define CV_PICK_AND_PACK4 ( base, offset) \
v_uint16x8 ( * ( unaligned_ushort* ) ( base + offset[ 0 ] ) , * ( unaligned_ushort* ) ( base + offset[ 1 ] ) , \
* ( unaligned_ushort* ) ( base + offset[ 2 ] ) , * ( unaligned_ushort* ) ( base + offset[ 3 ] ) , \
0 , 0 , 0 , 0 )
如果是单通道的数据。 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP 定义 v_load ,从内存加载到寄存器(vld1q_s16 )。 _xy0
和_xy1
是对应到源图上的坐标。 v0
、v1
、v2
和v3
是像素值, a0
、a1
、b0
和b1
是系数。
if ( cn == 1 )
{
for ( ; x <= width - 8 ; x += 8 )
{
v_int16x8 _xy0 = v_load ( XY + x* 2 ) ;
v_int16x8 _xy1 = v_load ( XY + x* 2 + 8 ) ;
v_int32x4 v0, v1, v2, v3, a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2;
v_dotprod 将两个寄存器中的值相乘,并对相邻的结果对求和。先调用 _v128_unzip ,后者会使用 vuzp1q_s16 和 vuzp2q_s16 解压向量。 xy0
和xy1
为对应到源图上的一维地址。 v_store 由 OPENCV_HAL_IMPL_NEON_LOADSTORE_OP 定义,将数据存储到内存(vst1q_s32 )。 保存到iofs0
和iofs1
这一步能否省掉?
v_int32x4 xy0 = v_dotprod( _xy0, xy2ofs );
v_int32x4 xy1 = v_dotprod( _xy1, xy2ofs );
v_store( iofs0, xy0 );
v_store( iofs1, xy1 );
CV_PICK_AND_PACK4 根据基地址和偏移地址获取内存中的4个值然后构造一个 v_uint16x8 对象。这里存在一半的浪费。 vec16
为源图上的对应像素,每个uint16
存储了相邻的两个像素。 v_reinterpret_as_u8 由 OPENCV_HAL_IMPL_NEON_INIT 定义,向量重新解释强制转换操作 (vreinterpretq_u8_u16 )。 v_expand 将寄存器的内容复制到两个2倍宽包装类型的寄存器中(vget_low_u8 +vmovl_u8 +vget_high_u8 )。 stub
包含真正的8个uint16
元素。 v0
再次将每对元素拼到一起。 加载下一行得到v1
。
v_uint16x8 stub, dummy;
v_uint16x8 vec16;
vec16 = CV_PICK_AND_PACK4 ( S0, iofs0) ;
v_expand ( v_reinterpret_as_u8 ( vec16) , stub, dummy) ;
v0 = v_reinterpret_as_s32 ( stub) ;
vec16 = CV_PICK_AND_PACK4 ( S1, iofs0) ;
v_expand ( v_reinterpret_as_u8 ( vec16) , stub, dummy) ;
v1 = v_reinterpret_as_s32 ( stub) ;
v_load_low 将数据加载到低位,高位未定义(vcombine_s32 +vld1_s32 +vdup_n_s32 )。 OPENCV_HAL_IMPL_NEON_UNPACKS 定义 v_zip ,从成对的两个源寄存器的下半部分读取相邻的向量元素,将这些对进行交织并写入目标寄存器(vzip1q_s16 )。 v_zip 交织两个向量。 wtab
原本为int16
,按照int32
加载使得第一行的两个系数成对。 a0
和b0
各自存储了两个像素的4个参数。加载方式比较低效? a1
和b1
中为垃圾值。 v_recombine 由 OPENCV_HAL_IMPL_NEON_UNPACKS 宏定义,分别合并两个向量的较低部分和较高部分(vcombine_s32 +vget_low_s32 +vget_high_s32 )。 a2
和b2
为交换后的值,即4个像素第一行和第二行的系数。 三个参数的 v_dotprod 将两个向量逐元素相乘,对相邻的乘积两两求和,再加上第三个参数。
$$\begin{aligned} f(x, y) &= (a_0\alpha_0+ a_1\alpha_1)\beta_0 + (b_0\alpha_0+ b_1\alpha_1)\beta_1\\ &= a_0\alpha_0\beta_0+ a_1\alpha_1\beta_0 + b_0\alpha_0\beta_1+ b_1\alpha_1\beta_1\\ &= f(Q_{11})(x_2 -x)(y_2 -y) + f(Q_{12})(x-x_1)(y_2 -y) + f(Q_{21})(x_2 -x)(y -y_1) + f(Q_{22})(x-x_1)(y-y_1) \end{aligned}$$
v_zip ( v_load_low ( ( int * ) ( wtab + FXY[ x] * 4 ) ) , v_load_low ( ( int * ) ( wtab + FXY[ x + 1 ] * 4 ) ) , a0, a1) ;
v_zip ( v_load_low ( ( int * ) ( wtab + FXY[ x + 2 ] * 4 ) ) , v_load_low ( ( int * ) ( wtab + FXY[ x + 3 ] * 4 ) ) , b0, b1) ;
v_recombine ( a0, b0, a2, b2) ;
v1 = v_dotprod ( v_reinterpret_as_s16 ( v1) , v_reinterpret_as_s16 ( b2) , delta) ;
v0 = v_dotprod ( v_reinterpret_as_s16 ( v0) , v_reinterpret_as_s16 ( a2) , v1) ;
对于iofs1
重复以上操作得到v2
。
vec16 = CV_PICK_AND_PACK4 ( S0, iofs1) ;
v_expand ( v_reinterpret_as_u8 ( vec16) , stub, dummy) ;
v2 = v_reinterpret_as_s32 ( stub) ;
vec16 = CV_PICK_AND_PACK4 ( S1, iofs1) ;
v_expand ( v_reinterpret_as_u8 ( vec16) , stub, dummy) ;
v3 = v_reinterpret_as_s32 ( stub) ;
v_zip ( v_load_low ( ( int * ) ( wtab + FXY[ x + 4 ] * 4 ) ) , v_load_low ( ( int * ) ( wtab + FXY[ x + 5 ] * 4 ) ) , c0, c1) ;
v_zip ( v_load_low ( ( int * ) ( wtab + FXY[ x + 6 ] * 4 ) ) , v_load_low ( ( int * ) ( wtab + FXY[ x + 7 ] * 4 ) ) , d0, d1) ;
v_recombine ( c0, d0, c2, d2) ;
v3 = v_dotprod ( v_reinterpret_as_s16 ( v3) , v_reinterpret_as_s16 ( d2) , delta) ;
v2 = v_dotprod ( v_reinterpret_as_s16 ( v2) , v_reinterpret_as_s16 ( c2) , v3) ;
v_pack_u_store 由 OPENCV_HAL_IMPL_NEON_PACK 宏定义。 v_pack 将两个值缩短(vqmovn_s32 )后合并为一个较大的向量 (vcombine_s16 )。 v_pack_u_store 将两个值缩短(vqmovun_s16 )后存储到内存(vst1_u8 )。 v0
和v2
是对应的。
v0 = v0 >> INTER_REMAP_COEF_BITS;
v2 = v2 >> INTER_REMAP_COEF_BITS;
v_pack_u_store ( D + x, v_pack ( v0, v2) ) ;
}
}
如果是三通道数据。
else if ( cn == 3 )
{
for ( ; x <= width - 5 ; x += 4 , D += 12 )
{
v_int16x8 u0, v0, u1, v1;
v_int16x8 _xy0 = v_load ( XY + x * 2 ) ;
v_int32x4 xy0 = v_dotprod ( _xy0, xy2ofs) ;
v_store ( iofs0, xy0) ;
int offset0 = FXY[ x] * 16 ;
int offset1 = FXY[ x + 1 ] * 16 ;
int offset2 = FXY[ x + 2 ] * 16 ;
int offset3 = FXY[ x + 3 ] * 16 ;
v_int16x8 w00 = v_load ( wtab + offset0) ;
v_int16x8 w01 = v_load ( wtab + offset0 + 8 ) ;
v_int16x8 w10 = v_load ( wtab + offset1) ;
v_int16x8 w11 = v_load ( wtab + offset1 + 8 ) ;
CV_PICK_AND_PACK_RGB ( S0, iofs0[ 0 ] , u0) ;
CV_PICK_AND_PACK_RGB ( S1, iofs0[ 0 ] , v0) ;
CV_PICK_AND_PACK_RGB ( S0, iofs0[ 1 ] , u1) ;
CV_PICK_AND_PACK_RGB ( S1, iofs0[ 1 ] , v1) ;
v_int32x4 result0 = v_dotprod ( u0, w00, v_dotprod ( v0, w01, delta) ) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod ( u1, w10, v_dotprod ( v1, w11, delta) ) >> INTER_REMAP_COEF_BITS;
result0 = v_rotate_left < 1 > ( result0) ;
v_int16x8 result8 = v_pack ( result0, result1) ;
v_uint8x16 result16 = v_pack_u ( result8, result8) ;
v_store_low ( D, v_rotate_right < 1 > ( result16) ) ;
w00 = v_load ( wtab + offset2) ;
w01 = v_load ( wtab + offset2 + 8 ) ;
w10 = v_load ( wtab + offset3) ;
w11 = v_load ( wtab + offset3 + 8 ) ;
CV_PICK_AND_PACK_RGB ( S0, iofs0[ 2 ] , u0) ;
CV_PICK_AND_PACK_RGB ( S1, iofs0[ 2 ] , v0) ;
CV_PICK_AND_PACK_RGB ( S0, iofs0[ 3 ] , u1) ;
CV_PICK_AND_PACK_RGB ( S1, iofs0[ 3 ] , v1) ;
result0 = v_dotprod ( u0, w00, v_dotprod ( v0, w01, delta) ) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod ( u1, w10, v_dotprod ( v1, w11, delta) ) >> INTER_REMAP_COEF_BITS;
result0 = v_rotate_left < 1 > ( result0) ;
result8 = v_pack ( result0, result1) ;
result16 = v_pack_u ( result8, result8) ;
v_store_low ( D + 6 , v_rotate_right < 1 > ( result16) ) ;
}
}
else if ( cn == 4 )
{
for ( ; x <= width - 4 ; x += 4 , D += 16 )
{
v_int16x8 _xy0 = v_load ( XY + x * 2 ) ;
v_int16x8 u0, v0, u1, v1;
v_int32x4 xy0 = v_dotprod ( _xy0, xy2ofs ) ;
v_store ( iofs0, xy0) ;
int offset0 = FXY[ x] * 16 ;
int offset1 = FXY[ x + 1 ] * 16 ;
int offset2 = FXY[ x + 2 ] * 16 ;
int offset3 = FXY[ x + 3 ] * 16 ;
v_int16x8 w00 = v_load ( wtab + offset0) ;
v_int16x8 w01 = v_load ( wtab + offset0 + 8 ) ;
v_int16x8 w10 = v_load ( wtab + offset1) ;
v_int16x8 w11 = v_load ( wtab + offset1 + 8 ) ;
CV_PICK_AND_PACK_RGBA ( S0, iofs0[ 0 ] , u0) ;
CV_PICK_AND_PACK_RGBA ( S1, iofs0[ 0 ] , v0) ;
CV_PICK_AND_PACK_RGBA ( S0, iofs0[ 1 ] , u1) ;
CV_PICK_AND_PACK_RGBA ( S1, iofs0[ 1 ] , v1) ;
v_int32x4 result0 = v_dotprod ( u0, w00, v_dotprod ( v0, w01, delta) ) >> INTER_REMAP_COEF_BITS;
v_int32x4 result1 = v_dotprod ( u1, w10, v_dotprod ( v1, w11, delta) ) >> INTER_REMAP_COEF_BITS;
v_int16x8 result8 = v_pack ( result0, result1) ;
v_pack_u_store ( D, result8) ;
w00 = v_load ( wtab + offset2) ;
w01 = v_load ( wtab + offset2 + 8 ) ;
w10 = v_load ( wtab + offset3) ;
w11 = v_load ( wtab + offset3 + 8 ) ;
CV_PICK_AND_PACK_RGBA ( S0, iofs0[ 2 ] , u0) ;
CV_PICK_AND_PACK_RGBA ( S1, iofs0[ 2 ] , v0) ;
CV_PICK_AND_PACK_RGBA ( S0, iofs0[ 3 ] , u1) ;
CV_PICK_AND_PACK_RGBA ( S1, iofs0[ 3 ] , v1) ;
result0 = v_dotprod ( u0, w00, v_dotprod ( v0, w01, delta) ) >> INTER_REMAP_COEF_BITS;
result1 = v_dotprod ( u1, w10, v_dotprod ( v1, w11, delta) ) >> INTER_REMAP_COEF_BITS;
result8 = v_pack ( result0, result1) ;
v_pack_u_store ( D + 8 , result8) ;
}
}
return x;
}
参考资料: