不妨看看编译器给出的做法:编译 int64_t neg(int64_t a) { return -a; }
在 32 位模式下。当然,询问编译器的不同方式将在内存中、编译器选择的寄存器中或已经在 EDX:EAX 中获得起始值。查看全部三种方式在 Godbolt 编译器资源管理器上 https://gcc.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(j:1,source:'%23include+%3Cstdint.h%3E%0A%0Aint64_t+neg_value_from_mem(int64_t+a)+%7B%0A+++++return+-a%3B%0A%7D%0A%0Aint64_t+neg_value_in_regs(int64_t+a)+%7B%0A++++//+The+OR+makes+the+compiler+load%2BOR+first%0A++++//+but+it+can+choose+regs+to+set+up+for+the+negate%0A++++int64_t+reg+%3D+a+%7C+0x1111111111LL%3B%0A++++//+clang+chooses+mov+reg,mem+++/+or+reg,imm8+when+possible,%0A++++//+otherwise+++++mov+reg,imm32+/+or+reg,mem.++Neat+:)%0A++++return+-reg%3B%0A%7D%0A%0A//int64_t+__attribute__((noinline))+foo()%7Breturn+0%3B%7D++//+in+case+you+want+to+compile+to+a+linked+binary%0Aint64_t+foo()%3B%0Aint64_t+neg_value_in_place(void)%0A%7B+++//+foo!'s+return+value+will+be+in+edx:eax%0A++++int64_t+a+%3D+foo()%3B%0A++++return+-a%3B%0A%7D'),l:'5',n:'0',o:'C%2B%2B+source+%231',t:'0')),k:36.372406215515106,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((g:!((h:compiler,i:(compiler:g63,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-xc+-m32+-Wall+-Wextra+-O3++-mtune%3Dhaswell+-fverbose-asm',source:1),l:'5',n:'0',o:'x86-64+gcc+6.3+(Editor+%231,+Compiler+%231)',t:'0')),k:30.29426045115158,l:'4',m:69.94100486990892,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:cl19_32,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-Ox',source:1),l:'5',n:'0',o:'x86+CL+19+2017+RTW+(Editor+%231,+Compiler+%233)',t:'0')),l:'4',m:30.05899513009107,n:'0',o:'',s:0,t:'0')),k:30.29426045115158,l:'3',n:'0',o:'',t:'0'),(g:!((h:compiler,i:(compiler:clang391,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-xc+-m32+-Wall+-Wextra+-O3++-mtune%3Dhaswell+-fverbose-asm',source:1),l:'5',n:'0',o:'x86-64+clang+3.9.1+(Editor+%231,+Compiler+%232)',t:'0')),k:33.33333333333333,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4,带有 gcc、clang 和 MSVC(又名 CL)的 asm 输出。
实现方法当然有很多,但任何可行的指令序列都必须在某一步把进位(或借位)从低 32 位传递到高 32 位,因此没有高效的办法可以避开 SBB 或 ADC。
如果该值一开始位于内存中,或者您想保留寄存器中的原始值,就先对目标寄存器进行异或清零,再使用 SUB/SBB。SysV x86-32 ABI 在堆栈上传递参数,并在 EDX:EAX 中返回 64 位整数。下面是 clang 3.9.1 -m32 -O3 https://gcc.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(j:1,source:'%23include+%3Cstdint.h%3E%0A%0Aint64_t+neg_value_from_mem(int64_t+a)+%7B%0A+++++return+-a%3B%0A%7D%0A%0Aint64_t+neg_value_in_regs(int64_t+a)+%7B%0A++++//+The+OR+makes+the+compiler+load%2BOR+first%0A++++//+but+it+can+choose+regs+to+set+up+for+the+negate%0A++++int64_t+reg+%3D+a+%7C+0x1111111111LL%3B%0A++++//+clang+chooses+mov+reg,mem+++/+or+reg,imm8+when+possible,%0A++++//+otherwise+++++mov+reg,imm32+/+or+reg,mem.++Neat+:)%0A++++return+-reg%3B%0A%7D%0A%0A//int64_t+__attribute__((noinline))+foo()%7Breturn+0%3B%7D++//+in+case+you+want+to+compile+to+a+linked+binary%0Aint64_t+foo()%3B%0Aint64_t+neg_value_in_place(void)%0A%7B+++//+foo!'s+return+value+will+be+in+edx:eax%0A++++int64_t+a+%3D+foo()%3B%0A++++return+-a%3B%0A%7D'),l:'5',n:'0',o:'C%2B%2B+source+%231',t:'0')),k:36.372406215515106,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((g:!((h:compiler,i:(compiler:g63,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-xc+-m32+-Wall+-Wextra+-O3++-mtune%3Dhaswell+-fverbose-asm',source:1),l:'5',n:'0',o:'x86-64+gcc+6.3+(Editor+%231,+Compiler+%231)',t:'0')),k:30.29426045115158,l:'4',m:69.94100486990892,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:cl19_32,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-Ox',source:1),l:'5',n:'0',o:'x86+CL+19+2017+RTW+(Editor+%231,+Compiler+%233)',t:'0')),l:'4',m:30.05899513009107,n:'0',o:'',s:0,t:'0')),k:30.29426045115158,l:'3',n:'0',o:'',t:'0'),(g:!((h:compiler,i:(compiler:clang391,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-xc+-m32+-Wall+-Wextra+-O3++-mtune%3Dhaswell+-fverbose-asm',source:1),l:'5',n:'0',o:'x86-64+clang+3.9.1+(Editor+%231,+Compiler+%232)',t:'0')),k:33.33333333333333,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4 为 neg_value_from_mem 生成的代码:
; optimal for data coming from memory: just subtract from zero
xor eax, eax
xor edx, edx
sub eax, dword ptr [esp + 4]
sbb edx, dword ptr [esp + 8]
如果值已经在寄存器中,并且不需要就地得到结果,您可以使用 NEG http://www.felixcloutier.com/x86/NEG.html 把寄存器设置为 0 减去它自身;当且仅当输入非零时 NEG 会置位 CF,也就是与 SUB 设置标志的方式相同。注意异或清零很便宜 https://stackoverflow.com/a/33668295/224132 ,而且不在延迟的关键路径上,因此这肯定比 gcc 的 3 指令序列(见下文)更好。
;; partially in-place: input in ecx:eax
xor edx, edx
neg eax ; eax = 0-eax, setting flags appropriately
sbb edx, ecx ;; result in edx:eax
即使在就地的情况下,clang 也会这样做,尽管这需要额外的一条 mov ecx,edx。在 mov reg,reg 零延迟的现代 CPU(Intel IvB+ 和 AMD Zen)上,这样做的延迟是最优的,但就融合域 uop 数量(前端吞吐量)或代码大小而言并非最优。
gcc 的序列很有趣,而且并非一目了然。在就地的情况下,它比 clang 少用一条指令,但在其他情况下更差。
; gcc's in-place sequence, only good for in-place use
neg eax
adc edx, 0
neg edx
; disadvantage: higher latency for the upper half than subtract-from-zero
; advantage: result in edx:eax with no extra registers used
遗憾的是,gcc 和 MSVC 总是使用这个序列,即使在异或清零 + sub/sbb 更好的情况下也是如此。
要更完整地了解编译器的行为,请查看它们为以下函数生成的输出(在 Godbolt 上 https://gcc.godbolt.org/#g:!((g:!((g:!((h:codeEditor,i:(j:1,source:'%23include+%3Cstdint.h%3E%0A%0Aint64_t+neg_value_from_mem(int64_t+a)+%7B%0A+++++return+-a%3B%0A%7D%0A%0Aint64_t+neg_value_in_regs(int64_t+a)+%7B%0A++++//+The+OR+makes+the+compiler+load%2BOR+first%0A++++//+but+it+can+choose+regs+to+set+up+for+the+negate%0A++++int64_t+reg+%3D+a+%7C+0x1111111111LL%3B%0A++++//+clang+chooses+mov+reg,mem+++/+or+reg,imm8+when+possible,%0A++++//+otherwise+++++mov+reg,imm32+/+or+reg,mem.++Neat+:)%0A++++return+-reg%3B%0A%7D%0A%0A//int64_t+__attribute__((noinline))+foo()%7Breturn+0%3B%7D++//+in+case+you+want+to+compile+to+a+linked+binary%0Aint64_t+foo()%3B%0Aint64_t+neg_value_in_place(void)%0A%7B+++//+foo!'s+return+value+will+be+in+edx:eax%0A++++int64_t+a+%3D+foo()%3B%0A++++return+-a%3B%0A%7D'),l:'5',n:'0',o:'C%2B%2B+source+%231',t:'0')),k:36.372406215515106,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((g:!((h:compiler,i:(compiler:g63,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-xc+-m32+-Wall+-Wextra+-O3++-mtune%3Dhaswell+-fverbose-asm',source:1),l:'5',n:'0',o:'x86-64+gcc+6.3+(Editor+%231,+Compiler+%231)',t:'0')),k:30.29426045115158,l:'4',m:69.94100486990892,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:cl19_32,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-Ox',source:1),l:'5',n:'0',o:'x86+CL+19+2017+RTW+(Editor+%231,+Compiler+%233)',t:'0')),l:'4',m:30.05899513009107,n:'0',o:'',s:0,t:'0')),k:30.29426045115158,l:'3',n:'0',o:'',t:'0'),(g:!((h:compiler,i:(compiler:clang391,filters:(b:'0',commentOnly:'0',directives:'0',intel:'0'),options:'-xc+-m32+-Wall+-Wextra+-O3++-mtune%3Dhaswell+-fverbose-asm',source:1),l:'5',n:'0',o:'x86-64+clang+3.9.1+(Editor+%231,+Compiler+%232)',t:'0')),k:33.33333333333333,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4 ):
#include <stdint.h>
/*
 * Return the arithmetic negation of a.
 *
 * Expressed as a subtraction from zero, which is the same operation as
 * unary minus. As with unary minus, the result is not representable
 * when a is INT64_MIN.
 */
int64_t neg_value_from_mem(int64_t a)
{
    const int64_t zero = 0;

    return zero - a;
}
/*
 * OR a fixed bit pattern into a and return the negation of the result.
 *
 * The OR is there so the compiler has to load and OR the value before
 * negating it, leaving it free to pick the registers that feed the
 * negate.
 */
int64_t neg_value_in_regs(int64_t a)
{
    const int64_t or_bits = 0x1111111111LL;
    int64_t in_regs = a;

    /*
     * Author's observation: clang uses mov reg,mem / or reg,imm8 when
     * possible, otherwise mov reg,imm32 / or reg,mem.
     */
    in_regs |= or_bits;

    return -in_regs;
}
/* Declaration only; the definition is not part of the visible source. */
int64_t foo();

/*
 * Return the negation of whatever foo() returns.
 *
 * The parameter a is not read by this function.
 */
int64_t neg_value_in_place(int64_t a)
{
    /* foo's return value will be in edx:eax */
    int64_t from_call = foo();

    return -from_call;
}