我正在学习如何在 Fortran 中使用 OpenMP 的 SIMD 指令(directive)。
我写了一段简单的代码:
! Baseline test program (no OpenMP directive): for every i in 1..10000 it adds
! 1/i to x ten million times and prints the final sum.
program loop
implicit none
integer :: i,j
real*8 :: x
! x is an 8-byte real; the literal 0.0 is a default-kind real converted on assignment.
x = 0.0
do i=1,10000
do j=1,10000000
! The added term does not depend on j. Both 1.0 and 1.0*i are default-kind reals,
! so the quotient is formed in default real kind and only then added to the
! 8-byte x (the non-OpenMP asm listing in this post shows vdivss followed by vcvtss2sd).
x = x + 1.0/(1.0*i)
enddo
enddo
print*, x
end program loop
当我编译这段代码并运行它时,我得到:
ifort -O3 -vec-report3 -xhost loop_simd.f90
loop_simd.f90(10): (col. 12) remark: LOOP WAS VECTORIZED
loop_simd.f90(9): (col. 7) remark: loop was not vectorized: not inner loop
time ./a.out
97876060.8355515
real 0m8.940s
user 0m8.937s
sys 0m0.005s
我按照编译器关于“not inner loop”(不是最内层循环)的提示,
添加了 SIMD collapse(2) 指令:
! Same test program as the baseline, with an OpenMP simd directive placed on the
! loop nest: for every i in 1..10000 it adds 1/i to x ten million times and
! prints the final sum.
program loop
implicit none
integer :: i,j
real*8 :: x
! x is an 8-byte real; the literal 0.0 is a default-kind real converted on assignment.
x = 0.0
! The directive below requests SIMD execution of the loop nest: collapse(2)
! associates both the i and the j loop with the construct, and reduction(+:x)
! declares x as a sum-reduction variable.
!$omp simd collapse(2) reduction(+:x)
do i=1,10000
do j=1,10000000
! The added term does not depend on j; 1.0 and 1.0*i are default-kind reals,
! so the quotient is formed in default real kind before being added to x.
x = x + 1.0/(1.0*i)
enddo
enddo
print*, x
end program loop
然后我再次编译并运行代码,
得到以下输出:
ifort -O3 -vec-report3 -openmp -xhost loop_simd.f90
loop_simd.f90(8): (col. 7) remark: OpenMP SIMD LOOP WAS VECTORIZED
time ./a.out
97876054.9903757
real 0m26.535s
user 0m26.540s
sys 0m0.003s
我不明白为什么加上 SIMD 指令后性能反而下降了?
在什么情况下,SIMD 版本会比普通的 Fortran 代码更快?
使用 -openmp 编译的代码的 ASM 输出:
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -openmp -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
#-----------------------------------------------------------------------
# MAIN__ -- compiler-generated main program of loop_simd.f90 for the build
# with "-O3 -vec-report3 -openmp -xhost -S" (see the mark_description lines).
# Syntax: AT&T / GAS (source operands first, destination last).
#
# Register roles in the loop nest (blocks ..B1.3 to ..B1.6):
#   eax  = outer-loop counter, starts at 0 and advances by 4
#   edx  = inner-loop counter, 0 .. 10000000 in steps of 1
#   xmm0 = four consecutive 32-bit i values, starts as {1,2,3,4}
#   xmm1 = {4,4,4,4}, the per-iteration increment for xmm0
#   ymm2 = the only accumulator: four double-precision partial sums
#   ymm3 = the four values 1/i for the current i group, as doubles
#
# So the outer (i) loop is the one processed four lanes at a time, while
# the inner (j) loop still runs all 10000000 iterations with one vaddpd each.
# Returns 1 in eax.
#-----------------------------------------------------------------------
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
# Prologue: set up a frame pointer, align rsp down to a 128-byte boundary
# and reserve 128 bytes of stack.
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
# Runtime start-up call with edi = 3 and rsi = 0x117fe.
# NOTE(review): presumably a CPU-feature check emitted for -xhost; the callee
# is not visible here -- confirm.
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.12: # Preds ..B1.1
..LN8:
# Read MXCSR to the stack, OR in 32832 (0x8040 = bits 15 and 6, which are the
# flush-to-zero and denormals-are-zero bits in the MXCSR layout) and load it
# back. Interleaved with this, the arguments for __kmpc_begin are set up:
# edi = address of a kmpc_loc struct, esi = 0.
vstmxcsr (%rsp) #1.9
..LN9:
movl $.2.3_2_kmpc_loc_struct_pack.1, %edi #1.9
..LN10:
xorl %esi, %esi #1.9
..LN11:
orl $32832, (%rsp) #1.9
..LN12:
xorl %eax, %eax #1.9
..LN13:
vldmxcsr (%rsp) #1.9
..___tag_value_MAIN__.6: #1.9
..LN14:
call __kmpc_begin #1.9
..___tag_value_MAIN__.7: #
..LN15:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.12
..LN16:
# Fortran runtime call; edi = address of __NLITPACK_0.0.1.
movl $__NLITPACK_0.0.1, %edi #1.9
..LN17:
call for_set_reentrancy #1.9
..LN18:
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
..LN19:
.loc 1 8 is_stmt 1
# Loop-nest set-up: ymm2 = 0 (accumulator), xmm1 = {4,4,4,4} (eax=4 moved
# into xmm0 and broadcast by vpshufd $0), eax = 0 (outer counter),
# xmm0 = .L_2il0floatpacket.19 = {1,2,3,4} (first four i values).
movl $4, %eax #8.7
..LN20:
.loc 1 6 is_stmt 1
vxorpd %ymm2, %ymm2, %ymm2 #6.7
..LN21:
.loc 1 8 is_stmt 1
vmovd %eax, %xmm0 #8.7
..LN22:
xorl %eax, %eax #8.7
..LN23:
vpshufd $0, %xmm0, %xmm1 #8.7
..LN24:
vmovdqu .L_2il0floatpacket.19(%rip), %xmm0 #8.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
..B1.4: # Preds ..B1.6 ..B1.3
..LN26:
.loc 1 11 is_stmt 1
# Outer-loop body, once per group of four i values: compute 1/i for four
# lanes in single precision without a divide instruction.
#   xmm3 = (float) i
#   xmm5 = r = approximate reciprocal of xmm3 (vrcpps)
#   xmm4 = i*r, xmm6 = 2r, xmm7 = i*r*r, xmm8 = 2r - i*r*r
# i.e. one refinement step applied to the vrcpps estimate; the result is then
# widened to four doubles in ymm3. edx is zeroed as the inner-loop counter.
# NOTE(review): the non-OpenMP listing in this post uses vdivss instead; this
# different way of forming 1/i may explain the slightly different sum printed
# by the two builds -- not verified.
vcvtdq2ps %xmm0, %xmm3 #11.34
..LN27:
vrcpps %xmm3, %xmm5 #11.28
..LN28:
vmulps %xmm3, %xmm5, %xmm4 #11.28
..LN29:
vaddps %xmm5, %xmm5, %xmm6 #11.28
..LN30:
vmulps %xmm5, %xmm4, %xmm7 #11.28
..LN31:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN32:
.loc 1 11 is_stmt 1
vsubps %xmm7, %xmm6, %xmm8 #11.28
..LN33:
vcvtps2pd %xmm8, %ymm3 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
..B1.5: # Preds ..B1.5 ..B1.4
..LN35:
.loc 1 10 is_stmt 1
# Inner (j) loop: 10000000 iterations, each doing a single ymm2 += ymm3.
# Every vaddpd reads the ymm2 written by the previous iteration, so all adds
# of the whole program form one serial dependency chain through ymm2
# (no unrolling, no second accumulator).
# NOTE(review): this is the likely cause of the 26.5 s run time reported in
# the post versus 8.9 s for the build without -openmp, whose inner loop keeps
# eight independent accumulators; instruction latencies are CPU-specific and
# not shown here -- confirm by measurement.
incl %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm3, %ymm2, %ymm2 #11.17
..LN37:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN38:
jb ..B1.5 # Prob 99% #10.12
..LN39:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
..B1.6: # Preds ..B1.5
..LN40:
.loc 1 8 is_stmt 1
# Outer-loop latch: advance the counter and the four i values by 4 and repeat
# while eax < 10000 (unsigned compare), i.e. 2500 passes.
addl $4, %eax #8.7
..LN41:
.loc 1 10 is_stmt 1
vpaddd %xmm1, %xmm0, %xmm0 #10.12
..LN42:
.loc 1 8 is_stmt 1
cmpl $10000, %eax #8.7
..LN43:
jb ..B1.4 # Prob 66% #8.7
..LN44:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
..B1.7: # Preds ..B1.6
..LN45:
.loc 1 6 is_stmt 1
..LN46:
.loc 1 15 is_stmt 1
# Horizontal sum of the four lanes of ymm2 (upper half + lower half, then
# high element + low element) into xmm4, stored at 64(%rsp). Interleaved with
# it, the arguments for for_write_seq_lis are set up: rdi = rsp, esi = -1,
# rdx = 0x1208384ff00, ecx = address of __STRLITPACK_0.0.1, r8 = rsp+64
# (address of the stored sum), eax = 0. vzeroupper is issued before the call.
lea (%rsp), %rdi #15.7
..LN47:
.loc 1 6 is_stmt 1
vextractf128 $1, %ymm2, %xmm0 #6.7
..LN48:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN49:
.loc 1 6 is_stmt 1
vaddpd %xmm0, %xmm2, %xmm1 #6.7
..LN50:
vunpckhpd %xmm1, %xmm1, %xmm3 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea 64(%rsp), %r8 #15.7
..LN52:
movq $0x1208384ff00, %rdx #15.7
..LN53:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN54:
xorl %eax, %eax #15.7
..LN55:
.loc 1 6 is_stmt 1
vaddsd %xmm3, %xmm1, %xmm4 #6.7
..LN56:
.loc 1 15 is_stmt 1
vmovsd %xmm4, 64(%rsp) #15.7
..LN57:
movq $0, (%rsp) #15.7
..LN58:
vzeroupper #15.7
..LN59:
call for_write_seq_lis #15.7
..LN60:
# LOE rbx r12 r13 r14 r15
..B1.8: # Preds ..B1.7
..LN61:
.loc 1 18 is_stmt 1
# OpenMP runtime call paired with the earlier __kmpc_begin; edi = address of
# the second kmpc_loc struct.
movl $.2.3_2_kmpc_loc_struct_pack.12, %edi #18.1
..LN62:
xorl %eax, %eax #18.1
..___tag_value_MAIN__.8: #18.1
..LN63:
call __kmpc_end #18.1
..___tag_value_MAIN__.9: #
..LN64:
# LOE rbx r12 r13 r14 r15
..B1.9: # Preds ..B1.8
..LN65:
# Epilogue: eax = 1 as return value, restore rsp from the frame pointer,
# pop rbp and return.
movl $1, %eax #18.1
..LN66:
movq %rbp, %rsp #18.1
..LN67:
popq %rbp #18.1
..___tag_value_MAIN__.10: #
..LN68:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.12: #
..LN69:
# LOE
..LN70:
# mark_end;
.type MAIN__,@function
.size MAIN__,.-MAIN__
..LNMAIN__.71:
.LNMAIN__:
# Static data for the -openmp build of MAIN__.
.data
.align 4
.align 4
# Location record whose address is passed in edi to __kmpc_begin: four 32-bit
# fields (0, 2, 0, 0) followed by a pointer to the string below.
.2.3_2_kmpc_loc_struct_pack.1:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.0
.align 4
# The bytes below are the ASCII text ";unknown;MAIN__;1;1;;" followed by
# three zero bytes of padding.
.2.3_2__kmpc_loc_pack.0:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 59
.byte 49
.byte 59
.byte 59
.space 3, 0x00 # pad
.align 4
# Location record whose address is passed in edi to __kmpc_end; same layout as
# the first record, pointing at the second string.
.2.3_2_kmpc_loc_struct_pack.12:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.11
.align 4
# The bytes below are the ASCII text ";unknown;MAIN__;18;18;;".
.2.3_2__kmpc_loc_pack.11:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 56
.byte 59
.byte 49
.byte 56
.byte 59
.byte 59
.section .rodata, "a"
.align 16
.align 8
# Two 32-bit words (2, 0); its address is the argument to for_set_reentrancy.
__NLITPACK_0.0.1:
.long 0x00000002,0x00000000
.align 4
# Five bytes (48, 1, 1, 0, 0); its address is passed in ecx to
# for_write_seq_lis. NOTE(review): the meaning of these bytes is defined by
# the Fortran runtime and is not visible here.
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 16
# Four 32-bit integers {1,2,3,4}: the initial i values loaded into xmm0
# before the loop nest.
.L_2il0floatpacket.19:
.long 0x00000001,0x00000002,0x00000003,0x00000004
.type .L_2il0floatpacket.19,@object
.size .L_2il0floatpacket.19,16
.align 16
# Four copies of 0x3f800000, the IEEE-754 single-precision encoding of 1.0.
# No instruction in the MAIN__ listing of this build references this label.
.L_2il0floatpacket.20:
.long 0x3f800000,0x3f800000,0x3f800000,0x3f800000
.type .L_2il0floatpacket.20,@object
.size .L_2il0floatpacket.20,16
.data
.section .note.GNU-stack, ""
# End
非 OpenMP 代码(不使用 -openmp 编译)的 ASM 输出:
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
#-----------------------------------------------------------------------
# MAIN__ -- compiler-generated main program of loop_simd.f90 for the build
# with "-O3 -vec-report3 -xhost -S" (no -openmp; see the mark_description lines).
# Syntax: AT&T / GAS (source operands first, destination last).
#
# Register roles in the loop nest (blocks ..B1.2 to ..B1.5):
#   eax  = i, incremented by 1 per outer pass, up to 10000
#   edx  = inner-loop counter, advances by 32 per pass up to 10000000
#   xmm6 = single-precision 1.0 (loaded from .L_2il0floatpacket.0)
#   ymm9 = 1/i for the current i, replicated into four double lanes
#   ymm8, ymm7, ymm0..ymm5 = eight independent accumulators of four doubles
#
# Here the inner (j) loop is the one that is vectorized and unrolled:
# 8 registers x 4 lanes = 32 j-iterations per pass.
# Returns 1 in eax.
#-----------------------------------------------------------------------
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
# Prologue: set up a frame pointer, align rsp down to a 128-byte boundary
# and reserve 128 bytes of stack.
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
# Runtime start-up call with edi = 3 and rsi = 0x117fe.
# NOTE(review): presumably a CPU-feature check emitted for -xhost; the callee
# is not visible here -- confirm.
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
..LN8:
# Read MXCSR to the stack, OR in 32832 (0x8040 = bits 15 and 6, which are the
# flush-to-zero and denormals-are-zero bits in the MXCSR layout) and load it
# back; then call for_set_reentrancy with edi = address of __NLITPACK_0.0.1.
vstmxcsr (%rsp) #1.9
..LN9:
movl $__NLITPACK_0.0.1, %edi #1.9
..LN10:
orl $32832, (%rsp) #1.9
..LN11:
vldmxcsr (%rsp) #1.9
..LN12:
call for_set_reentrancy #1.9
..LN13:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.10
..LN14:
.loc 1 6 is_stmt 1
..LN15:
.loc 1 11 is_stmt 1
# Loop-nest set-up: xmm6 = 1.0f, eax = 0 (i counter), and eight accumulators
# zeroed (ymm8 cleared, then copied into ymm7 and ymm0..ymm5).
vmovss .L_2il0floatpacket.0(%rip), %xmm6 #11.28
..LN16:
.loc 1 9 is_stmt 1
xorl %eax, %eax #9.7
..LN17:
.loc 1 6 is_stmt 1
vxorpd %ymm8, %ymm8, %ymm8 #6.7
..LN18:
vmovapd %ymm8, %ymm7 #6.7
..LN19:
vmovapd %ymm8, %ymm0 #6.7
..LN20:
vmovapd %ymm8, %ymm1 #6.7
..LN21:
vmovapd %ymm8, %ymm2 #6.7
..LN22:
vmovapd %ymm8, %ymm3 #6.7
..LN23:
vmovapd %ymm8, %ymm4 #6.7
..LN24:
vmovapd %ymm8, %ymm5 #6.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
..B1.3: # Preds ..B1.5 ..B1.2
..LN26:
# Outer-loop body, once per i: i = eax + 1, converted to single precision
# (xmm9 is zeroed first, then vcvtsi2ss), xmm10 = 1.0f / i with a scalar
# single-precision divide, widened to double, duplicated into both halves of
# xmm11 and then into both 128-bit halves of ymm9. edx is zeroed as the
# inner-loop counter.
incl %eax #
..LN27:
.loc 1 11 is_stmt 1
vxorps %xmm9, %xmm9, %xmm9 #11.28
..LN28:
vcvtsi2ss %eax, %xmm9, %xmm9 #11.28
..LN29:
vdivss %xmm9, %xmm6, %xmm10 #11.28
..LN30:
vcvtss2sd %xmm10, %xmm10, %xmm10 #11.28
..LN31:
vmovddup %xmm10, %xmm11 #11.28
..LN32:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN33:
.loc 1 11 is_stmt 1
vinsertf128 $1, %xmm11, %ymm11, %ymm9 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
..B1.4: # Preds ..B1.4 ..B1.3
..LN35:
.loc 1 10 is_stmt 1
# Inner (j) loop: the counter advances by 32 per pass and the loop runs while
# edx < 10000000 (unsigned compare). Each pass adds ymm9 into eight different
# accumulator registers, so the eight vaddpd instructions in one pass do not
# depend on each other; each accumulator only depends on its own value from
# the previous pass.
addl $32, %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm9, %ymm8, %ymm8 #11.17
..LN37:
vaddpd %ymm7, %ymm9, %ymm7 #11.17
..LN38:
vaddpd %ymm0, %ymm9, %ymm0 #11.17
..LN39:
vaddpd %ymm1, %ymm9, %ymm1 #11.17
..LN40:
vaddpd %ymm2, %ymm9, %ymm2 #11.17
..LN41:
vaddpd %ymm3, %ymm9, %ymm3 #11.17
..LN42:
vaddpd %ymm4, %ymm9, %ymm4 #11.17
..LN43:
vaddpd %ymm5, %ymm9, %ymm5 #11.17
..LN44:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN45:
jb ..B1.4 # Prob 99% #10.12
..LN46:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
..B1.5: # Preds ..B1.4
..LN47:
.loc 1 9 is_stmt 1
# Outer-loop latch: repeat while eax < 10000.
cmpl $10000, %eax #9.7
..LN48:
jb ..B1.3 # Prob 66% #9.7
..LN49:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
..B1.6: # Preds ..B1.5
..LN50:
.loc 1 6 is_stmt 1
# Final reduction: the eight accumulators are added pairwise (8 -> 4 -> 2 -> 1)
# into ymm5, then its four lanes are summed horizontally into xmm10 and stored
# at 64(%rsp). Interleaved with it, the arguments for for_write_seq_lis are
# set up: rdi = rsp, esi = -1, rdx = 0x1208384ff00, ecx = address of
# __STRLITPACK_0.0.1, r8 = rsp+64 (address of the stored sum), eax = 0.
# vzeroupper is issued before the call.
vaddpd %ymm7, %ymm8, %ymm6 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea (%rsp), %rdi #15.7
..LN52:
.loc 1 6 is_stmt 1
vaddpd %ymm1, %ymm0, %ymm0 #6.7
..LN53:
vaddpd %ymm3, %ymm2, %ymm1 #6.7
..LN54:
vaddpd %ymm5, %ymm4, %ymm2 #6.7
..LN55:
vaddpd %ymm0, %ymm6, %ymm3 #6.7
..LN56:
vaddpd %ymm2, %ymm1, %ymm4 #6.7
..LN57:
vaddpd %ymm4, %ymm3, %ymm5 #6.7
..LN58:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN59:
movq $0x1208384ff00, %rdx #15.7
..LN60:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN61:
xorl %eax, %eax #15.7
..LN62:
lea 64(%rsp), %r8 #15.7
..LN63:
movq $0, (%rsp) #15.7
..LN64:
.loc 1 6 is_stmt 1
vextractf128 $1, %ymm5, %xmm7 #6.7
..LN65:
vaddpd %xmm7, %xmm5, %xmm8 #6.7
..LN66:
vunpckhpd %xmm8, %xmm8, %xmm9 #6.7
..LN67:
vaddsd %xmm9, %xmm8, %xmm10 #6.7
..LN68:
.loc 1 15 is_stmt 1
vmovsd %xmm10, 64(%rsp) #15.7
..LN69:
vzeroupper #15.7
..LN70:
call for_write_seq_lis #15.7
..LN71:
# LOE rbx r12 r13 r14 r15
..B1.7: # Preds ..B1.6
..LN72:
.loc 1 18 is_stmt 1
# Epilogue: eax = 1 as return value, restore rsp from the frame pointer,
# pop rbp and return.
movl $1, %eax #18.1
..LN73:
movq %rbp, %rsp #18.1
..LN74:
popq %rbp #18.1
..___tag_value_MAIN__.6: #
..LN75:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.8: #
..LN76:
# LOE
..LN77:
# mark_end;
.type MAIN__,@function
.size MAIN__,.-MAIN__
..LNMAIN__.78:
.LNMAIN__:
# Read-only data for the non-OpenMP build of MAIN__.
.section .rodata, "a"
.align 8
.align 8
# Two 32-bit words (0, 0); its address is the argument to for_set_reentrancy.
__NLITPACK_0.0.1:
.long 0x00000000,0x00000000
.align 4
# Five bytes (48, 1, 1, 0, 0); its address is passed in ecx to
# for_write_seq_lis. NOTE(review): the meaning of these bytes is defined by
# the Fortran runtime and is not visible here.
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 4
# 0x3f800000 is the IEEE-754 single-precision encoding of 1.0; it is loaded
# into xmm6 and used as the dividend of the vdivss that forms 1/i.
.L_2il0floatpacket.0:
.long 0x3f800000
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,4
.data
.section .note.GNU-stack, ""
# End