为什么 clang 优化会破坏我的内联汇编代码?

2024-02-26

为了尝试了解有关 ARM 汇编的知识,我编写了一个简单的测试项目来使用内联汇编和 NEON 指令执行图像缩小。你可以在这里看到它:

https://github.com/rmaz/NEON-Image-Downscaling

经过一番努力,我终于让它正常工作了,可喜可贺。但它只在低于 -O2 的优化级别下才能正常运行。我查看了生成的汇编代码,却看不出导致这种情况的任何明显原因。有谁能指点一下吗?下面是包含内联汇编的函数:

/**
 * Downscales two adjacent source rows into one destination row at half width.
 *
 * dst          - destination pointer; 4 pixels are stored to it per loop iteration
 * src          - top source row; the bottom row (rowB) starts pixelsPerRow pixels after it
 * pixelsPerRow - source row length in 32-bit pixels, rounded down to a multiple of 8
 *
 * NOTE(review): `subs` updates the condition flags and `vst1.64` writes memory,
 * but the clobber list names only q0-q3 - confirm whether "cc" and "memory"
 * should be declared as well.
 * NOTE(review): the loop ends only when the counter reaches exactly zero, so a
 * rounded count of 0 is not special-cased - confirm callers pass at least 8.
 */
static void inline resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow)
{
    // second source row: pixelsPerRow pixels after the first
    const uint32_t * rowB = src + pixelsPerRow;

    // force the number of pixels per row to a multiple of 8
    pixelsPerRow = 8 * (pixelsPerRow / 8);    

    __asm__ volatile("Lresizeloop:                      \n" // start loop
                     "vld1.32       {d0-d3}, [%1]!      \n" // load 8 pixels from the top row
                     "vld1.32       {d4-d7}, [%2]!      \n" // load 8 pixels from the bottom row
                     "vhadd.u8      q0, q0, q2          \n" // average the pixels vertically
                     "vhadd.u8      q1, q1, q3          \n"
                     "vtrn.32       q0, q2              \n" // transpose to put the horizontally adjacent pixels in different registers
                     "vtrn.32       q1, q3              \n"
                     "vhadd.u8      q0, q0, q2          \n" // average the pixels horizontally
                     "vhadd.u8      q1, q1, q3          \n"
                     "vtrn.32       d0, d1              \n" // fill the registers with pixels
                     "vtrn.32       d2, d3              \n"
                     "vswp          d1, d2              \n"
                     "vst1.64       {d0-d1}, [%0]!      \n" // store the result
                     "subs          %3, %3, #8          \n" // subtract 8 from the pixel count
                     "bne           Lresizeloop         \n" // repeat until the row is complete
                     // outputs: operands %0-%3, each held in a general-purpose register
                     : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
                     // inputs: tied to the outputs above through matching constraints "0"-"3"
                     : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
                     // declared clobbers: NEON registers q0-q3 only
                     : "q0", "q1", "q2", "q3"
                     );
}

调用它的外层函数及其循环在 -O1 下生成的输出如下:

@ Compiler-generated Thumb code for -[BDPViewController downscaleImageNeon:].
@ It reads the image's width, height, bytesPerRow and alpha info, allocates an
@ output buffer with calloc, calls _resizeRow once per pair of source rows, and
@ then builds the scaled image through createBitmapContext /
@ CGBitmapContextCreateImage before releasing its temporaries.
.align  2
    .code   16                      @ @"\01-[BDPViewController downscaleImageNeon:]"
    .thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
    .cfi_startproc
Lfunc_begin4:
    .loc    1 86 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
@ BB#0:
    .loc    1 86 1 prologue_end     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
    @ Prologue: save r4-r7/lr, set r7 = sp + 12, save r8/r10/r11, reserve 20 bytes of stack.
    push    {r4, r5, r6, r7, lr}
    add r7, sp, #12
    push.w  {r8, r10, r11}
    sub sp, #20
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
    .loc    1 88 20                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
    movw    r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
    mov r6, r2
Ltmp43:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
    movt    r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
    add r0, pc
    @ r11 = selector loaded PC-relatively; it is reused for every objc_msgSend on image below
    ldr.w   r11, [r0]
    mov r0, r6
    blx _objc_retain
    mov r4, r0
    mov r0, r6
    mov r1, r11
Ltmp44:
    blx _objc_msgSend
    blx _CGImageGetWidth
    mov r5, r0
Ltmp45:
    @DEBUG_VALUE: width <- R5+0
    .loc    1 89 21                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
    mov r0, r6
    mov r1, r11
    str r5, [sp, #16]           @ 4-byte Spill
    blx _objc_msgSend
    blx _CGImageGetHeight
    mov r10, r0
Ltmp46:
    @DEBUG_VALUE: height <- R10+0
    .loc    1 90 26                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetBytesPerRow
    str r0, [sp, #12]           @ 4-byte Spill
Ltmp47:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    .loc    1 91 35                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetAlphaInfo
    str r0, [sp, #4]            @ 4-byte Spill
Ltmp48:
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    mov r6, r0
Ltmp49:
    mov r0, r4
    blx _objc_release
    mov r0, r6
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    @ r8 = height * width
    mul r8, r10, r5
Ltmp50:
    @DEBUG_VALUE: width <- [sp+#16]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    blx _CGImageGetDataProvider
    blx _CGDataProviderCopyData
Ltmp51:
    @DEBUG_VALUE: data <- R0+0
    str r0, [sp, #8]            @ 4-byte Spill
Ltmp52:
    @DEBUG_VALUE: data <- [sp+#8]+#0
    .loc    1 95 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
    blx _CFDataGetBytePtr
    mov r4, r0
Ltmp53:
    @DEBUG_VALUE: buffer <- R4+0
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    @ calloc((height * width) / 4, 4) -> outputBuffer in r5
    lsr.w   r0, r8, #2
    movs    r1, #4
    blx _calloc
    mov r5, r0
Ltmp54:
    @DEBUG_VALUE: outputBuffer <- R5+0
    mov r0, r10
Ltmp55:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    @ skip the row loop entirely when height == 0
    cmp r0, #0
Ltmp56:
    @DEBUG_VALUE: rowIndex <- 0+0
    beq LBB4_3
@ BB#1:                                 @ %.lr.ph
Ltmp57:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: buffer <- R4+0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    @ loop setup: r8 = rowIndex = 0, r11 = bytesPerRow * 2, r10 = bytesPerRow / 2
    ldr r1, [sp, #12]           @ 4-byte Reload
Ltmp58:
    @DEBUG_VALUE: bytesPerRow <- R1+0
    mov.w   r8, #0
    lsl.w   r11, r1, #1
    .loc    1 104 74                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
    lsr.w   r10, r1, #1
Ltmp60:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2:                                 @ =>This Inner Loop Header: Depth=1
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    @ r0 = outputBuffer + (rowIndex / 2) * (bytesPerRow / 2); height is parked in r6 across the call
    lsr.w   r1, r8, #1
Ltmp61:
    mov r6, r0
Ltmp62:
    @DEBUG_VALUE: height <- R6+0
    mla r0, r1, r10, r5
Ltmp63:
    @DEBUG_VALUE: destRow <- R1+0
    .loc    1 105 9                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
    @ r2 = width, r1 = current source row pointer (r4)
    ldr r2, [sp, #16]           @ 4-byte Reload
    mov r1, r4
Ltmp64:
    @ resizeRow is reached through a real call in this listing; it is not inlined here
    bl  _resizeRow
    mov r0, r6
Ltmp65:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 50                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
    @ rowIndex += 2
    add.w   r8, r8, #2
Ltmp66:
    @DEBUG_VALUE: rowIndex <- R8+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    @ advance the source pointer by two rows, then loop while rowIndex < height (unsigned);
    @ the cmp that feeds blo comes after the call has returned
    add r4, r11
    cmp r8, r0
    blo LBB4_2
Ltmp67:
LBB4_3:                                 @ %._crit_edge
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    .loc    1 109 28                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
    @ createBitmapContext arguments: r0 = outputBuffer, r1 = width / 2, r2 = height / 2,
    @ r3 = bytesPerRow / 2, [sp] = imageAlpha
    ldr r1, [sp, #4]            @ 4-byte Reload
Ltmp68:
    lsrs    r2, r0, #1
    str r1, [sp]
    mov r6, r5
Ltmp69:
    @DEBUG_VALUE: outputBuffer <- R6+0
    ldr r1, [sp, #16]           @ 4-byte Reload
    ldr r0, [sp, #12]           @ 4-byte Reload
Ltmp70:
    lsrs    r1, r1, #1
    lsrs    r3, r0, #1
    mov r0, r5
    bl  _createBitmapContext
    mov r4, r0
Ltmp71:
    @DEBUG_VALUE: context <- R4+0
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    blx _CGBitmapContextCreateImage
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movw    r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    mov r5, r0
Ltmp72:
    @DEBUG_VALUE: scaledImage <- R5+0
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movt    r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    movw    r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
    movt    r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
    add r1, pc
LPC4_2:
    add r0, pc
    mov r2, r5
    ldr r1, [r1]
    ldr r0, [r0]
    blx _objc_msgSend
Ltmp73:
    @DEBUG_VALUE: returnImage <- R0+0
    @ InlineAsm Start
    mov r7, r7      @ marker for objc_retainAutoreleaseReturnValue
    @ InlineAsm End
    blx _objc_retainAutoreleasedReturnValue
Ltmp74:
    @ r8 keeps the returned image while the temporaries below are released
    mov r8, r0
    .loc    1 112 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
    mov r0, r5
    blx _CGImageRelease
    .loc    1 113 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
    mov r0, r4
    blx _CGContextRelease
    .loc    1 114 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
    ldr r0, [sp, #8]            @ 4-byte Reload
    blx _CFRelease
    .loc    1 115 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
    mov r0, r6
    blx _free
Ltmp75:
    .loc    1 118 1                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
    @ Epilogue: restore the stack and saved registers, then tail-branch to
    @ objc_autoreleaseReturnValue with the result in r0
    mov r0, r8
    add sp, #20
    pop.w   {r8, r10, r11}
    pop.w   {r4, r5, r6, r7, lr}
Ltmp76:
    b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
    .cfi_endproc

    @ _resizeRow: out-of-line copy of resizeRow.
    @ In (per the DEBUG_VALUE notes below): r0 = dst, r1 = src, r2 = pixelsPerRow.
    @ Computes rowB in r3, rounds r2 down to a multiple of 8, runs the inline-asm
    @ loop and returns with bx lr; no other function is called.
    .align  2
    .code   16                      @ @resizeRow
    .thumb_func _resizeRow
_resizeRow:
    .cfi_startproc
Lfunc_begin5:
    .loc    1 26 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
@ BB#0:
    @DEBUG_VALUE: resizeRow:dst <- R0+0
    @DEBUG_VALUE: resizeRow:src <- R1+0
    @DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
    .loc    1 27 47 prologue_end    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
    @ r3 = r1 + r2 * 4: rowB, the row that starts pixelsPerRow 32-bit pixels after src
    add.w   r3, r1, r2, lsl #2
Ltmp78:
    @DEBUG_VALUE: rowB <- R3+0
    .loc    1 30 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
    @ clear the low three bits: r2 = pixelsPerRow rounded down to a multiple of 8
    bic r2, r2, #7
Ltmp79:
    .loc    1 32 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
    @ InlineAsm Start
    Lresizeloop:                      
vld1.32       {d0-d3}, [r1]!      
vld1.32       {d4-d7}, [r3]!      
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       q0, q2              
vtrn.32       q1, q3              
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       d0, d1              
vtrn.32       d2, d3              
vswp          d1, d2              
vst1.64       {d0-d1}, [r0]!      
    @ subs updates the condition flags; bne repeats the loop until r2 reaches zero
subs          r2, r2, #8          
bne           Lresizeloop         

    @ InlineAsm End
Ltmp80:
    .loc    1 51 1                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
    bx  lr
Ltmp81:
Lfunc_end5:
    .cfi_endproc

-O2 下无法正常工作的输出如下:

    @ Compiler-generated Thumb code for -[BDPViewController downscaleImageNeon:].
    @ It reads the image's width, height, bytesPerRow and alpha info, allocates an
    @ output buffer with calloc, calls _resizeRow once per pair of source rows, and
    @ then builds the scaled image through createBitmapContext /
    @ CGBitmapContextCreateImage before releasing its temporaries.
    .align  2
    .code   16                      @ @"\01-[BDPViewController downscaleImageNeon:]"
    .thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
    .cfi_startproc
Lfunc_begin4:
    .loc    1 86 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
@ BB#0:
    .loc    1 86 1 prologue_end     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
    @ Prologue: save r4-r7/lr, set r7 = sp + 12, save r8/r10/r11, reserve 20 bytes of stack.
    push    {r4, r5, r6, r7, lr}
    add r7, sp, #12
    push.w  {r8, r10, r11}
    sub sp, #20
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
    .loc    1 88 20                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
    movw    r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
    mov r6, r2
Ltmp43:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
    movt    r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
    add r0, pc
    @ r11 = selector loaded PC-relatively; it is reused for every objc_msgSend on image below
    ldr.w   r11, [r0]
    mov r0, r6
    blx _objc_retain
    mov r4, r0
    mov r0, r6
    mov r1, r11
Ltmp44:
    blx _objc_msgSend
    blx _CGImageGetWidth
    mov r5, r0
Ltmp45:
    @DEBUG_VALUE: width <- R5+0
    .loc    1 89 21                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
    mov r0, r6
    mov r1, r11
    str r5, [sp, #16]           @ 4-byte Spill
    blx _objc_msgSend
    blx _CGImageGetHeight
    mov r10, r0
Ltmp46:
    @DEBUG_VALUE: height <- R10+0
    .loc    1 90 26                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetBytesPerRow
    str r0, [sp, #12]           @ 4-byte Spill
Ltmp47:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    .loc    1 91 35                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetAlphaInfo
    str r0, [sp, #4]            @ 4-byte Spill
Ltmp48:
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    mov r6, r0
Ltmp49:
    mov r0, r4
    blx _objc_release
    mov r0, r6
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    @ r8 = height * width
    mul r8, r10, r5
Ltmp50:
    @DEBUG_VALUE: width <- [sp+#16]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    blx _CGImageGetDataProvider
    blx _CGDataProviderCopyData
Ltmp51:
    @DEBUG_VALUE: data <- R0+0
    str r0, [sp, #8]            @ 4-byte Spill
Ltmp52:
    @DEBUG_VALUE: data <- [sp+#8]+#0
    .loc    1 95 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
    blx _CFDataGetBytePtr
    mov r4, r0
Ltmp53:
    @DEBUG_VALUE: buffer <- R4+0
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    @ calloc((height * width) / 4, 4) -> outputBuffer in r5
    lsr.w   r0, r8, #2
    movs    r1, #4
    blx _calloc
    mov r5, r0
Ltmp54:
    @DEBUG_VALUE: outputBuffer <- R5+0
    mov r0, r10
Ltmp55:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    @ skip the row loop entirely when height == 0
    cmp r0, #0
Ltmp56:
    @DEBUG_VALUE: rowIndex <- 0+0
    beq LBB4_3
@ BB#1:                                 @ %.lr.ph
Ltmp57:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: buffer <- R4+0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    @ loop setup: r8 = rowIndex = 0, r11 = bytesPerRow * 2, r10 = bytesPerRow / 2
    ldr r1, [sp, #12]           @ 4-byte Reload
Ltmp58:
    @DEBUG_VALUE: bytesPerRow <- R1+0
    mov.w   r8, #0
    lsl.w   r11, r1, #1
    .loc    1 104 74                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
    lsr.w   r10, r1, #1
Ltmp60:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2:                                 @ =>This Inner Loop Header: Depth=1
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    @ r0 = outputBuffer + (rowIndex / 2) * (bytesPerRow / 2); height is parked in r6 across the call
    lsr.w   r1, r8, #1
Ltmp61:
    mov r6, r0
Ltmp62:
    @DEBUG_VALUE: height <- R6+0
    mla r0, r1, r10, r5
Ltmp63:
    @DEBUG_VALUE: destRow <- R1+0
    .loc    1 105 9                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
    @ r2 = width, r1 = current source row pointer (r4)
    ldr r2, [sp, #16]           @ 4-byte Reload
    mov r1, r4
Ltmp64:
    @ resizeRow is reached through a real call in this listing; it is not inlined here
    bl  _resizeRow
    mov r0, r6
Ltmp65:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 50                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
    @ rowIndex += 2
    add.w   r8, r8, #2
Ltmp66:
    @DEBUG_VALUE: rowIndex <- R8+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    @ advance the source pointer by two rows, then loop while rowIndex < height (unsigned);
    @ the cmp that feeds blo comes after the call has returned
    add r4, r11
    cmp r8, r0
    blo LBB4_2
Ltmp67:
LBB4_3:                                 @ %._crit_edge
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    .loc    1 109 28                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
    @ createBitmapContext arguments: r0 = outputBuffer, r1 = width / 2, r2 = height / 2,
    @ r3 = bytesPerRow / 2, [sp] = imageAlpha
    ldr r1, [sp, #4]            @ 4-byte Reload
Ltmp68:
    lsrs    r2, r0, #1
    str r1, [sp]
    mov r6, r5
Ltmp69:
    @DEBUG_VALUE: outputBuffer <- R6+0
    ldr r1, [sp, #16]           @ 4-byte Reload
    ldr r0, [sp, #12]           @ 4-byte Reload
Ltmp70:
    lsrs    r1, r1, #1
    lsrs    r3, r0, #1
    mov r0, r5
    bl  _createBitmapContext
    mov r4, r0
Ltmp71:
    @DEBUG_VALUE: context <- R4+0
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    blx _CGBitmapContextCreateImage
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movw    r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    mov r5, r0
Ltmp72:
    @DEBUG_VALUE: scaledImage <- R5+0
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movt    r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    movw    r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
    movt    r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
    add r1, pc
LPC4_2:
    add r0, pc
    mov r2, r5
    ldr r1, [r1]
    ldr r0, [r0]
    blx _objc_msgSend
Ltmp73:
    @DEBUG_VALUE: returnImage <- R0+0
    @ InlineAsm Start
    mov r7, r7      @ marker for objc_retainAutoreleaseReturnValue
    @ InlineAsm End
    blx _objc_retainAutoreleasedReturnValue
Ltmp74:
    @ r8 keeps the returned image while the temporaries below are released
    mov r8, r0
    .loc    1 112 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
    mov r0, r5
    blx _CGImageRelease
    .loc    1 113 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
    mov r0, r4
    blx _CGContextRelease
    .loc    1 114 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
    ldr r0, [sp, #8]            @ 4-byte Reload
    blx _CFRelease
    .loc    1 115 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
    mov r0, r6
    blx _free
Ltmp75:
    .loc    1 118 1                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
    @ Epilogue: restore the stack and saved registers, then tail-branch to
    @ objc_autoreleaseReturnValue with the result in r0
    mov r0, r8
    add sp, #20
    pop.w   {r8, r10, r11}
    pop.w   {r4, r5, r6, r7, lr}
Ltmp76:
    b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
    .cfi_endproc

    @ _resizeRow: out-of-line copy of resizeRow.
    @ In (per the DEBUG_VALUE notes below): r0 = dst, r1 = src, r2 = pixelsPerRow.
    @ Computes rowB in r3, rounds r2 down to a multiple of 8, runs the inline-asm
    @ loop and returns with bx lr; no other function is called.
    .align  2
    .code   16                      @ @resizeRow
    .thumb_func _resizeRow
_resizeRow:
    .cfi_startproc
Lfunc_begin5:
    .loc    1 26 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
@ BB#0:
    @DEBUG_VALUE: resizeRow:dst <- R0+0
    @DEBUG_VALUE: resizeRow:src <- R1+0
    @DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
    .loc    1 27 47 prologue_end    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
    @ r3 = r1 + r2 * 4: rowB, the row that starts pixelsPerRow 32-bit pixels after src
    add.w   r3, r1, r2, lsl #2
Ltmp78:
    @DEBUG_VALUE: rowB <- R3+0
    .loc    1 30 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
    @ clear the low three bits: r2 = pixelsPerRow rounded down to a multiple of 8
    bic r2, r2, #7
Ltmp79:
    .loc    1 32 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
    @ InlineAsm Start
    Lresizeloop:                      
vld1.32       {d0-d3}, [r1]!      
vld1.32       {d4-d7}, [r3]!      
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       q0, q2              
vtrn.32       q1, q3              
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       d0, d1              
vtrn.32       d2, d3              
vswp          d1, d2              
vst1.64       {d0-d1}, [r0]!      
    @ subs updates the condition flags; bne repeats the loop until r2 reaches zero
subs          r2, r2, #8          
bne           Lresizeloop         

    @ InlineAsm End
Ltmp80:
    .loc    1 51 1                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
    bx  lr
Ltmp81:
Lfunc_end5:
    .cfi_endproc

这是我用 -O2 编译您的 Xcode 项目后得到的汇编代码片段。(用 -O1 构建时,编译器根本不会去内联该函数,所以它在 -O1 下运行正常,我并不感到意外。)

@ Tail of the outer row loop with resizeRow's inline asm expanded in place:
@ the compare for the loop branch sits above the asm block, the branch below it.
Ltmp55:
    @DEBUG_VALUE: rowIndex <- R3+0
    .loc    1 101 29                @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    add r8, r12
    @ this compare sets the flags that the blo after the inline asm block is meant to test
    cmp r3, r11
    .loc    1 32 5                  @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
Ltmp56:
    @ InlineAsm Start
    Lresizeloop:                      
vld1.32       {d0-d3}, [r4]!      
vld1.32       {d4-d7}, [r5]!      
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       q0, q2              
vtrn.32       q1, q3              
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       d0, d1              
vtrn.32       d2, d3              
vswp          d1, d2              
vst1.64       {d0-d1}, [r6]!      
    @ subs rewrites the condition flags that cmp r3, r11 set above
subs          r2, r2, #8          
bne           Lresizeloop         

    @ InlineAsm End
Ltmp57:
    @ branches on the flags left behind by the inline asm, not on cmp r3, r11
    blo LBB2_2

看到最后一行那条 blo(branch if lower,即"无符号小于则跳转")指令了吗?它使用的是内联汇编块上方那条 cmp r3, r11 所设置的条件码。但显然,执行到那里时,您的内联汇编代码(其中的 subs 指令)早已把条件码寄存器完全改写了。那么这是编译器的 bug 吗?……并不是!您只是忘了告诉编译器:您的内联汇编代码会破坏条件码。请将下面这段代码:

                 : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
                 : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
                 : "q0", "q1", "q2", "q3"
                 );

替换为:

                 : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
                 : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
                 : "q0", "q1", "q2", "q3", "cc"
                 );

这样生成的汇编输出就正确了。我还没有实际运行该应用程序,但我敢打赌,您会发现现在一切都正常多了。 :)

本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

为什么 clang 优化会破坏我的内联汇编代码? 的相关文章

随机推荐