From c2066d19cda20d0e98b95da1493e6c3f26fd9618 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Wed, 3 Aug 2022 16:38:34 +0800 Subject: [PATCH] [X86][MC] Always emit `rep` prefix for `bsf` The new `BMI` instruction `tzcnt` has better performance than `bsf` on new processors. Compared to `bsf`, its encoding has a mandatory '0xf3' prefix. If we always emit the `rep` prefix for `bsf`, we gain better performance when the same code runs on new processors. GCC already does this: https://c.godbolt.org/z/6xere6fs1 Fixes #34191 Reviewed By: skan Differential Revision: https://reviews.llvm.org/D130956 --- llvm/lib/Target/X86/X86MCInstLower.cpp | 4 + llvm/test/CodeGen/X86/clz.ll | 110 +++++++++++++++------ llvm/test/CodeGen/X86/dagcombine-select.ll | 16 +-- .../CodeGen/X86/peephole-na-phys-copy-folding.ll | 2 + llvm/test/CodeGen/X86/stack-folding-x86_64.ll | 4 +- 5 files changed, 97 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 0972e80..4c33646 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -982,6 +982,10 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) std::swap(OutMI.getOperand(1), OutMI.getOperand(2)); } + // Add an REP prefix to BSF instructions so that new processors can + // recognize as TZCNT, which has better performance than BSF. 
+ if (X86::isBSF(OutMI.getOpcode()) && !MF.getFunction().hasOptSize()) + OutMI.setFlags(X86::IP_HAS_REPEAT); break; } } diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll index b66902f..a4c3457 100644 --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -18,13 +18,13 @@ declare i64 @llvm.ctlz.i64(i64, i1) define i8 @cttz_i8(i8 %x) { ; X86-LABEL: cttz_i8: ; X86: # %bb.0: -; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i8: ; X64: # %bb.0: -; X64-NEXT: bsfl %edi, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -46,12 +46,12 @@ define i8 @cttz_i8(i8 %x) { define i16 @cttz_i16(i16 %x) { ; X86-LABEL: cttz_i16: ; X86: # %bb.0: -; X86-NEXT: bsfw {{[0-9]+}}(%esp), %ax +; X86-NEXT: rep bsfw {{[0-9]+}}(%esp), %ax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i16: ; X64: # %bb.0: -; X64-NEXT: bsfw %di, %ax +; X64-NEXT: rep bsfw %di, %ax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i16: @@ -72,12 +72,12 @@ define i16 @cttz_i16(i16 %x) { define i32 @cttz_i32(i32 %x) { ; X86-LABEL: cttz_i32: ; X86: # %bb.0: -; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: cttz_i32: ; X64: # %bb.0: -; X64-NEXT: bsfl %edi, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i32: @@ -100,20 +100,20 @@ define i64 @cttz_i64(i64 %x) { ; X86-NOCMOV-NEXT: testl %eax, %eax ; X86-NOCMOV-NEXT: jne .LBB3_1 ; X86-NOCMOV-NEXT: # %bb.2: -; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %eax +; X86-NOCMOV-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: addl $32, %eax ; X86-NOCMOV-NEXT: xorl %edx, %edx ; X86-NOCMOV-NEXT: retl ; X86-NOCMOV-NEXT: .LBB3_1: -; X86-NOCMOV-NEXT: bsfl %eax, %eax +; X86-NOCMOV-NEXT: rep bsfl %eax, %eax ; X86-NOCMOV-NEXT: xorl %edx, %edx ; X86-NOCMOV-NEXT: retl ; ; 
X86-CMOV-LABEL: cttz_i64: ; X86-CMOV: # %bb.0: ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-CMOV-NEXT: bsfl %ecx, %edx -; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %eax +; X86-CMOV-NEXT: rep bsfl %ecx, %edx +; X86-CMOV-NEXT: rep bsfl {{[0-9]+}}(%esp), %eax ; X86-CMOV-NEXT: addl $32, %eax ; X86-CMOV-NEXT: testl %ecx, %ecx ; X86-CMOV-NEXT: cmovnel %edx, %eax @@ -122,7 +122,7 @@ define i64 @cttz_i64(i64 %x) { ; ; X64-LABEL: cttz_i64: ; X64: # %bb.0: -; X64-NEXT: bsfq %rdi, %rax +; X64-NEXT: rep bsfq %rdi, %rax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i64: @@ -517,7 +517,7 @@ define i8 @cttz_i8_zero_test(i8 %n) { ; X86-NEXT: je .LBB12_1 ; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsfl %eax, %eax +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; X86-NEXT: .LBB12_1: @@ -531,7 +531,7 @@ define i8 @cttz_i8_zero_test(i8 %n) { ; X64-NEXT: je .LBB12_1 ; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsfl %eax, %eax +; X64-NEXT: rep bsfl %eax, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; X64-NEXT: .LBB12_1: @@ -565,7 +565,7 @@ define i16 @cttz_i16_zero_test(i16 %n) { ; X86-NEXT: testw %ax, %ax ; X86-NEXT: je .LBB13_1 ; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsfw %ax, %ax +; X86-NEXT: rep bsfw %ax, %ax ; X86-NEXT: retl ; X86-NEXT: .LBB13_1: ; X86-NEXT: movw $16, %ax @@ -576,7 +576,7 @@ define i16 @cttz_i16_zero_test(i16 %n) { ; X64-NEXT: testw %di, %di ; X64-NEXT: je .LBB13_1 ; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsfw %di, %ax +; X64-NEXT: rep bsfw %di, %ax ; X64-NEXT: retq ; X64-NEXT: .LBB13_1: ; X64-NEXT: movw $16, %ax @@ -603,7 +603,7 @@ define i32 @cttz_i32_zero_test(i32 %n) { ; X86-NEXT: testl %eax, %eax ; X86-NEXT: je .LBB14_1 ; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: bsfl %eax, %eax +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl ; X86-NEXT: .LBB14_1: ; X86-NEXT: movl $32, %eax @@ -614,7 +614,7 @@ 
define i32 @cttz_i32_zero_test(i32 %n) { ; X64-NEXT: testl %edi, %edi ; X64-NEXT: je .LBB14_1 ; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsfl %edi, %eax +; X64-NEXT: rep bsfl %edi, %eax ; X64-NEXT: retq ; X64-NEXT: .LBB14_1: ; X64-NEXT: movl $32, %eax @@ -638,7 +638,7 @@ define i64 @cttz_i64_zero_test(i64 %n) { ; X86-NOCMOV-LABEL: cttz_i64_zero_test: ; X86-NOCMOV: # %bb.0: ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOCMOV-NEXT: bsfl {{[0-9]+}}(%esp), %edx +; X86-NOCMOV-NEXT: rep bsfl {{[0-9]+}}(%esp), %edx ; X86-NOCMOV-NEXT: movl $32, %eax ; X86-NOCMOV-NEXT: je .LBB15_2 ; X86-NOCMOV-NEXT: # %bb.1: @@ -651,18 +651,18 @@ define i64 @cttz_i64_zero_test(i64 %n) { ; X86-NOCMOV-NEXT: xorl %edx, %edx ; X86-NOCMOV-NEXT: retl ; X86-NOCMOV-NEXT: .LBB15_3: -; X86-NOCMOV-NEXT: bsfl %ecx, %eax +; X86-NOCMOV-NEXT: rep bsfl %ecx, %eax ; X86-NOCMOV-NEXT: xorl %edx, %edx ; X86-NOCMOV-NEXT: retl ; ; X86-CMOV-LABEL: cttz_i64_zero_test: ; X86-CMOV: # %bb.0: ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CMOV-NEXT: bsfl {{[0-9]+}}(%esp), %ecx +; X86-CMOV-NEXT: rep bsfl {{[0-9]+}}(%esp), %ecx ; X86-CMOV-NEXT: movl $32, %edx ; X86-CMOV-NEXT: cmovnel %ecx, %edx ; X86-CMOV-NEXT: addl $32, %edx -; X86-CMOV-NEXT: bsfl %eax, %eax +; X86-CMOV-NEXT: rep bsfl %eax, %eax ; X86-CMOV-NEXT: cmovel %edx, %eax ; X86-CMOV-NEXT: xorl %edx, %edx ; X86-CMOV-NEXT: retl @@ -672,7 +672,7 @@ define i64 @cttz_i64_zero_test(i64 %n) { ; X64-NEXT: testq %rdi, %rdi ; X64-NEXT: je .LBB15_1 ; X64-NEXT: # %bb.2: # %cond.false -; X64-NEXT: bsfq %rdi, %rax +; X64-NEXT: rep bsfq %rdi, %rax ; X64-NEXT: retq ; X64-NEXT: .LBB15_1: ; X64-NEXT: movl $64, %eax @@ -822,7 +822,7 @@ define i8 @cttz_i8_knownbits(i8 %x) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orb $2, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: bsfl %eax, %eax +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -830,7 +830,7 @@ define i8 @cttz_i8_knownbits(i8 %x) { ; X64: # 
%bb.0: ; X64-NEXT: orb $2, %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: bsfl %eax, %eax +; X64-NEXT: rep bsfl %eax, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -983,12 +983,12 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) { ; X86-NOCMOV-NEXT: # %bb.2: ; X86-NOCMOV-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X86-NOCMOV-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NOCMOV-NEXT: bsfl %eax, %eax +; X86-NOCMOV-NEXT: rep bsfl %eax, %eax ; X86-NOCMOV-NEXT: orl $32, %eax ; X86-NOCMOV-NEXT: xorl %edx, %edx ; X86-NOCMOV-NEXT: retl ; X86-NOCMOV-NEXT: .LBB22_1: -; X86-NOCMOV-NEXT: bsfl %eax, %eax +; X86-NOCMOV-NEXT: rep bsfl %eax, %eax ; X86-NOCMOV-NEXT: xorl %edx, %edx ; X86-NOCMOV-NEXT: retl ; @@ -997,8 +997,8 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) { ; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-CMOV-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X86-CMOV-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-CMOV-NEXT: bsfl %ecx, %edx -; X86-CMOV-NEXT: bsfl %eax, %eax +; X86-CMOV-NEXT: rep bsfl %ecx, %edx +; X86-CMOV-NEXT: rep bsfl %eax, %eax ; X86-CMOV-NEXT: orl $32, %eax ; X86-CMOV-NEXT: testl %ecx, %ecx ; X86-CMOV-NEXT: cmovnel %edx, %eax @@ -1009,7 +1009,7 @@ define i64 @cttz_i64_zero_test_knownneverzero(i64 %n) { ; X64: # %bb.0: ; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 ; X64-NEXT: orq %rdi, %rax -; X64-NEXT: bsfq %rax, %rax +; X64-NEXT: rep bsfq %rax, %rax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: cttz_i64_zero_test_knownneverzero: @@ -1110,3 +1110,55 @@ define i32 @PR47603_zext(i32 %a0, ptr %a1) { %sext = sext i8 %load to i32 ret i32 %sext } + +define i32 @cttz_i32_osize(i32 %x) optsize { +; X86-LABEL: cttz_i32_osize: +; X86: # %bb.0: +; X86-NOT: rep +; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: cttz_i32_osize: +; X64: # %bb.0: +; X64-NOT: rep +; X64-NEXT: bsfl %edi, %eax +; X64-NEXT: retq +; +; X86-CLZ-LABEL: cttz_i32_osize: +; X86-CLZ: # %bb.0: +; 
X86-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax +; X86-CLZ-NEXT: retl +; +; X64-CLZ-LABEL: cttz_i32_osize: +; X64-CLZ: # %bb.0: +; X64-CLZ-NEXT: tzcntl %edi, %eax +; X64-CLZ-NEXT: retq + %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true ) + ret i32 %tmp +} + +define i32 @cttz_i32_msize(i32 %x) minsize { +; X86-LABEL: cttz_i32_msize: +; X86: # %bb.0: +; X86-NOT: rep +; X86-NEXT: bsfl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: cttz_i32_msize: +; X64: # %bb.0: +; X64-NOT: rep +; X64-NEXT: bsfl %edi, %eax +; X64-NEXT: retq +; +; X86-CLZ-LABEL: cttz_i32_msize: +; X86-CLZ: # %bb.0: +; X86-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax +; X86-CLZ-NEXT: retl +; +; X64-CLZ-LABEL: cttz_i32_msize: +; X64-CLZ: # %bb.0: +; X64-CLZ-NEXT: tzcntl %edi, %eax +; X64-CLZ-NEXT: retq + %tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true ) + ret i32 %tmp +} diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll index 9bebec6..dbd9a2c 100644 --- a/llvm/test/CodeGen/X86/dagcombine-select.ll +++ b/llvm/test/CodeGen/X86/dagcombine-select.ll @@ -281,7 +281,7 @@ declare i64 @llvm.cttz.i64(i64, i1) define i64 @cttz_64_eq_select(i64 %v) nounwind { ; NOBMI-LABEL: cttz_64_eq_select: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfq %rdi, %rcx +; NOBMI-NEXT: rep bsfq %rdi, %rcx ; NOBMI-NEXT: movq $-1, %rax ; NOBMI-NEXT: cmovneq %rcx, %rax ; NOBMI-NEXT: addq $6, %rax @@ -305,7 +305,7 @@ define i64 @cttz_64_eq_select(i64 %v) nounwind { define i64 @cttz_64_ne_select(i64 %v) nounwind { ; NOBMI-LABEL: cttz_64_ne_select: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfq %rdi, %rcx +; NOBMI-NEXT: rep bsfq %rdi, %rcx ; NOBMI-NEXT: movq $-1, %rax ; NOBMI-NEXT: cmovneq %rcx, %rax ; NOBMI-NEXT: addq $6, %rax @@ -330,7 +330,7 @@ declare i32 @llvm.cttz.i32(i32, i1) define i32 @cttz_32_eq_select(i32 %v) nounwind { ; NOBMI-LABEL: cttz_32_eq_select: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: rep bsfl %edi, %ecx ; NOBMI-NEXT: movl $-1, %eax ; NOBMI-NEXT: cmovnel %ecx, %eax 
; NOBMI-NEXT: addl $6, %eax @@ -354,7 +354,7 @@ define i32 @cttz_32_eq_select(i32 %v) nounwind { define i32 @cttz_32_ne_select(i32 %v) nounwind { ; NOBMI-LABEL: cttz_32_ne_select: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: rep bsfl %edi, %ecx ; NOBMI-NEXT: movl $-1, %eax ; NOBMI-NEXT: cmovnel %ecx, %eax ; NOBMI-NEXT: addl $6, %eax @@ -379,7 +379,7 @@ define i32 @cttz_32_ne_select(i32 %v) nounwind { define i32 @cttz_32_eq_select_ffs(i32 %v) nounwind { ; NOBMI-LABEL: cttz_32_eq_select_ffs: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: rep bsfl %edi, %ecx ; NOBMI-NEXT: movl $-1, %eax ; NOBMI-NEXT: cmovnel %ecx, %eax ; NOBMI-NEXT: incl %eax @@ -403,7 +403,7 @@ define i32 @cttz_32_eq_select_ffs(i32 %v) nounwind { define i32 @cttz_32_ne_select_ffs(i32 %v) nounwind { ; NOBMI-LABEL: cttz_32_ne_select_ffs: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: rep bsfl %edi, %ecx ; NOBMI-NEXT: movl $-1, %eax ; NOBMI-NEXT: cmovnel %ecx, %eax ; NOBMI-NEXT: incl %eax @@ -428,7 +428,7 @@ define i32 @cttz_32_ne_select_ffs(i32 %v) nounwind { define i32 @cttz_32_eq_select_ffs_m1(i32 %v) nounwind { ; NOBMI-LABEL: cttz_32_eq_select_ffs_m1: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: rep bsfl %edi, %ecx ; NOBMI-NEXT: movl $-1, %eax ; NOBMI-NEXT: cmovnel %ecx, %eax ; NOBMI-NEXT: retq @@ -449,7 +449,7 @@ define i32 @cttz_32_eq_select_ffs_m1(i32 %v) nounwind { define i32 @cttz_32_ne_select_ffs_m1(i32 %v) nounwind { ; NOBMI-LABEL: cttz_32_ne_select_ffs_m1: ; NOBMI: # %bb.0: -; NOBMI-NEXT: bsfl %edi, %ecx +; NOBMI-NEXT: rep bsfl %edi, %ecx ; NOBMI-NEXT: movl $-1, %eax ; NOBMI-NEXT: cmovnel %ecx, %eax ; NOBMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll index f3d4b62..9069688 100644 --- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -353,6 +353,7 @@ define i1 
@asm_clobbering_flags(ptr %mem) nounwind { ; CHECK32-NEXT: testl %edx, %edx ; CHECK32-NEXT: setg %al ; CHECK32-NEXT: #APP +; CHECK32-NOT: rep ; CHECK32-NEXT: bsfl %edx, %edx ; CHECK32-NEXT: #NO_APP ; CHECK32-NEXT: movl %edx, (%ecx) @@ -364,6 +365,7 @@ define i1 @asm_clobbering_flags(ptr %mem) nounwind { ; CHECK64-NEXT: testl %ecx, %ecx ; CHECK64-NEXT: setg %al ; CHECK64-NEXT: #APP +; CHECK64-NOT: rep ; CHECK64-NEXT: bsfl %ecx, %ecx ; CHECK64-NEXT: #NO_APP ; CHECK64-NEXT: movl %ecx, (%rdi) diff --git a/llvm/test/CodeGen/X86/stack-folding-x86_64.ll b/llvm/test/CodeGen/X86/stack-folding-x86_64.ll index e6fbec1..c505f4b 100644 --- a/llvm/test/CodeGen/X86/stack-folding-x86_64.ll +++ b/llvm/test/CodeGen/X86/stack-folding-x86_64.ll @@ -37,7 +37,7 @@ define i32 @stack_fold_bsf_i32(i32 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: bsfl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload +; CHECK-NEXT: rep bsfl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -82,7 +82,7 @@ define i64 @stack_fold_bsf_i64(i64 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: bsfq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; CHECK-NEXT: rep bsfq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 -- 2.7.4