From bcc4c909544e136ac7198d0acf6768817b702bed Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 17 Oct 2022 16:15:59 -0700 Subject: [PATCH] [BOLT] Fix instruction encoding validation Always use non-symbolizing disassembler for instruction encoding validation as symbols will be treated as undefined/zeros be the encoder and causing byte sequence mismatches. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D136118 --- bolt/include/bolt/Core/BinaryContext.h | 8 ++++---- bolt/lib/Core/BinaryContext.cpp | 18 ++++++++++++------ bolt/lib/Core/BinaryFunction.cpp | 11 +++-------- bolt/test/X86/encoding-validation.s | 31 +++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 18 deletions(-) create mode 100644 bolt/test/X86/encoding-validation.s diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 3c99426..84e301a 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -1236,10 +1236,10 @@ public: return Size; } - /// Verify that assembling instruction \p Inst results in the same sequence of - /// bytes as \p Encoding. - bool validateEncoding(const MCInst &Instruction, - ArrayRef Encoding) const; + /// Validate that disassembling the \p Sequence of bytes into an instruction + /// and assembling the instruction again, results in a byte sequence identical + /// to the original one. + bool validateInstructionEncoding(ArrayRef Sequence) const; /// Return a function execution count threshold for determining whether /// the function is 'hot'. Consider it hot if count is above the average exec diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index f3f55b9..6a01ab4 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -2287,19 +2287,25 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) { return std::make_pair(HotSize, ColdSize); } -bool BinaryContext::validateEncoding(const MCInst &Inst, - ArrayRef InputEncoding) const { +bool BinaryContext::validateInstructionEncoding( + ArrayRef InputSequence) const { + MCInst Inst; + uint64_t InstSize; + DisAsm->getInstruction(Inst, InstSize, InputSequence, 0, nulls()); + assert(InstSize == InputSequence.size() && + "Disassembled instruction size does not match the sequence."); + SmallString<256> Code; SmallVector Fixups; raw_svector_ostream VecOS(Code); MCE->encodeInstruction(Inst, VecOS, Fixups, *STI); - auto EncodedData = ArrayRef((uint8_t *)Code.data(), Code.size()); - if (InputEncoding != EncodedData) { + auto OutputSequence = ArrayRef((uint8_t *)Code.data(), Code.size()); + if (InputSequence != OutputSequence) { if (opts::Verbosity > 1) { errs() << "BOLT-WARNING: mismatched encoding detected\n" - << " input: " << InputEncoding << '\n' - << " output: " << EncodedData << '\n'; + << " input: " << InputSequence << '\n' + << " output: " << OutputSequence << '\n'; } return false; } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 88d9ace..b91cac7 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1217,7 +1217,7 @@ bool BinaryFunction::disassemble() { // Check integrity of LLVM assembler/disassembler. if (opts::CheckEncoding && !BC.MIB->isBranch(Instruction) && !BC.MIB->isCall(Instruction) && !BC.MIB->isNoop(Instruction)) { - if (!BC.validateEncoding(Instruction, FunctionData.slice(Offset, Size))) { + if (!BC.validateInstructionEncoding(FunctionData.slice(Offset, Size))) { errs() << "BOLT-WARNING: mismatching LLVM encoding detected in " << "function " << *this << " for instruction :\n"; BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr); @@ -1233,15 +1233,10 @@ bool BinaryFunction::disassemble() { break; } - // Disassemble again without the symbolizer and check that the disassembly - // matches the assembler output. - MCInst TempInst; - BC.DisAsm->getInstruction(TempInst, Size, FunctionData.slice(Offset), - AbsoluteInstrAddr, nulls()); - if (!BC.validateEncoding(TempInst, FunctionData.slice(Offset, Size))) { + if (!BC.validateInstructionEncoding(FunctionData.slice(Offset, Size))) { errs() << "BOLT-WARNING: internal assembler/disassembler error " "detected for AVX512 instruction:\n"; - BC.printInstruction(errs(), TempInst, AbsoluteInstrAddr); + BC.printInstruction(errs(), Instruction, AbsoluteInstrAddr); errs() << " in function " << *this << '\n'; setIgnored(); break; diff --git a/bolt/test/X86/encoding-validation.s b/bolt/test/X86/encoding-validation.s new file mode 100644 index 0000000..0c716ab --- /dev/null +++ b/bolt/test/X86/encoding-validation.s @@ -0,0 +1,31 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t.exe -q +# RUN: llvm-bolt %t.exe --relocs -o %t.out --check-encoding |& FileCheck %s + + .text + .globl _start + .type _start, %function +_start: + .cfi_startproc + +## Check that llvm-bolt uses non-symbolizing disassembler while validating +## instruction encodings. If symbol "foo" below is symbolized, the encoded +## instruction would have a different sequence of bytes from the input +## sequence, as "foo" will not have any address assigned at that point. + + movq foo(%rip), %rax +# CHECK-NOT: mismatching LLVM encoding detected + + ret + .cfi_endproc + .size _start, .-_start + + .globl foo + .type foo, %function +foo: + .cfi_startproc + ret + .cfi_endproc + .size foo, .-foo -- 2.7.4