// It should help move much of the target specific code from llvm-objdump to
// respective target disassemblers.
+ /// Suggest a distance to skip in a buffer of data to find the next
+ /// place to look for the start of an instruction. For example, if
+ /// all instructions have a fixed alignment, this might advance to
+ /// the next multiple of that alignment.
+ ///
+ /// If not overridden, the default is 1.
+ ///
+ /// \param Address - The address, in the memory space of region, of the
+ /// starting point (typically the first byte of something
+ /// that did not decode as a valid instruction at all).
+ /// \param Bytes - A reference to the actual bytes at Address. May be
+ /// needed in order to determine the width of an
+ /// unrecognized instruction (e.g. in Thumb this is a simple
+ /// consistent criterion that doesn't require knowing the
+ /// specific instruction). The caller can pass as much data
+ /// as they have available, and the function is required to
+ /// make a reasonable default choice if not enough data is
+ /// available to make a better one.
+ /// \return - A number of bytes to skip. Must always be greater than
+ /// zero. May be greater than the size of Bytes.
+ virtual uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const;
+
private:
MCContext &Ctx;
return None;
}
+uint64_t MCDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const {
+ return 1;
+}
+
bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
uint64_t Address, bool IsBranch,
uint64_t Offset, uint64_t OpSize,
return MCDisassembler::Fail;
}
+uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const {
+ // AArch64 instructions are always 4 bytes wide, so there's no point
+ // in skipping any smaller number of bytes if an instruction can't
+ // be decoded.
+ return 4;
+}
+
static MCSymbolizer *
createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
LLVMSymbolLookupCallback SymbolLookUp,
MCDisassembler::DecodeStatus
getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
uint64_t Address, raw_ostream &CStream) const override;
+
+ uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const override;
};
} // end namespace llvm
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const override;
+ uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const override;
+
private:
DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
}
}
+uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+ uint64_t Address) const {
+ // In Arm state, instructions are always 4 bytes wide, so there's no
+ // point in skipping any smaller number of bytes if an instruction
+ // can't be decoded.
+ if (!STI.getFeatureBits()[ARM::ModeThumb])
+ return 4;
+
+ // In a Thumb instruction stream, a halfword is a standalone 2-byte
+ // instruction if and only if its value is less than 0xE800.
+ // Otherwise, it's the first halfword of a 4-byte instruction.
+ //
+ // So, if we can see the upcoming halfword, we can judge on that
+ // basis, and maybe skip a whole 4-byte instruction that we don't
+ // know how to decode, without accidentally trying to interpret its
+ // second half as something else.
+ //
+ // If we don't have the instruction data available, we just have to
+ // recommend skipping the minimum sensible distance, which is 2
+ // bytes.
+ if (Bytes.size() < 2)
+ return 2;
+
+ uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0];
+ return Insn16 < 0xE800 ? 2 : 4;
+}
+
DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
--- /dev/null
+# RUN: yaml2obj %s | llvm-objdump -d --mcpu=cortex-a8 - | FileCheck %s
+
+# Test that unrecognized instructions are skipped in a way that makes
+# sense for the Arm instruction set encoding.
+#
+# The first three instructions in this file are marked by the mapping
+# symbols as in Arm state, with the one in the middle unknown, and we
+# expect the disassembler to skip 4 bytes because that's the width of
+# any Arm instruction.
+#
+# At address 0xc there's a mapping symbol that says we're now in Thumb
+# mode, and in that mode we include both a 16-bit and a 32-bit unknown
+# Thumb instruction, which the disassembler will identify by the simple
+# encoding criterion that tells you the instruction length without
+# having to recognize it specifically.
+#
+# Finally we end with a single byte, to ensure nothing gets confused
+# when the Thumb instruction stream doesn't contain enough data to
+# even do that check.
+
+# CHECK: 0: 64 00 a0 e3 mov r0, #100
+# CHECK-NEXT: 4: ff ff ff ff <unknown>
+# CHECK-NEXT: 8: 12 03 81 e0 add r0, r1, r2, lsl r3
+
+# CHECK: c: 64 20 movs r0, #100
+# CHECK-NEXT: e: 0e b8 <unknown>
+# CHECK-NEXT: 10: 40 18 adds r0, r0, r1
+# CHECK-NEXT: 12: 4f f0 64 00 mov.w r0, #100
+# CHECK-NEXT: 16: ee ff cc dd <unknown>
+# CHECK-NEXT: 1a: 01 eb c2 00 add.w r0, r1, r2, lsl #3
+# CHECK-NEXT: 1e: 9a <unknown>
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS32
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_ARM
+ Flags: [ EF_ARM_EABI_VER5 ]
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ AddressAlign: 0x4
+ Content: 6400a0e3ffffffff120381e064200eb840184ff06400eeffccdd01ebc2009a
+Symbols:
+ - Name: '$a'
+ Section: .text
+ - Name: '$t'
+ Section: .text
+ Value: 0x0c
+...
## llvm-objdump --mattr=+ext1
# CHECK: 00000000 <.text>:
-# CHECK-NEXT: 0: cb <unknown>
-# CHECK-NEXT: 1: f3 f7 8b be b.w 0xffff3d1b <{{.+}}> @ imm = #-49898
+# CHECK-NEXT: 0: cb f3 f7 8b <unknown>
+# CHECK-NEXT: 4: be <unknown>
--- !ELF
FileHeader:
// Disassemble a real instruction and record function-local branch labels.
MCInst Inst;
uint64_t Size;
- bool Disassembled = DisAsm->getInstruction(
- Inst, Size, Bytes.slice(Index - SectionAddr), Index, nulls());
+ ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index - SectionAddr);
+ bool Disassembled =
+ DisAsm->getInstruction(Inst, Size, ThisBytes, Index, nulls());
if (Size == 0)
- Size = 1;
+ Size = std::min(ThisBytes.size(),
+ DisAsm->suggestBytesToSkip(ThisBytes, Index));
if (Disassembled && MIA) {
uint64_t Target;
for (size_t Index = 0; Index != Bytes.size();) {
MCInst Inst;
uint64_t Size;
- DisAsm->getInstruction(Inst, Size, Bytes.slice(Index), SectionAddr + Index,
- nulls());
+ ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index - SectionAddr);
+ DisAsm->getInstruction(Inst, Size, ThisBytes, Index, nulls());
if (Size == 0)
- Size = 1;
+ Size = std::min(ThisBytes.size(),
+ DisAsm->suggestBytesToSkip(ThisBytes, Index));
Index += Size;
}
ArrayRef<uint64_t> LabelAddrsRef = SymbolizerPtr->getReferencedAddresses();
// Disassemble a real instruction or a data when disassemble all is
// provided
MCInst Inst;
- bool Disassembled =
- DisAsm->getInstruction(Inst, Size, Bytes.slice(Index),
- SectionAddr + Index, CommentStream);
+ ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index);
+ uint64_t ThisAddr = SectionAddr + Index;
+ bool Disassembled = DisAsm->getInstruction(Inst, Size, ThisBytes,
+ ThisAddr, CommentStream);
if (Size == 0)
- Size = 1;
+ Size = std::min(ThisBytes.size(),
+ DisAsm->suggestBytesToSkip(ThisBytes, ThisAddr));
LVP.update({Index, Section.getIndex()},
{Index + Size, Section.getIndex()}, Index + Size != End);
for (uint64_t Index = 0, Size = 0; Index < Section.getSize();
Index += Size) {
MCInst Inst;
- if (!DisAsm->getInstruction(Inst, Size, Bytes.slice(Index),
- SectionAddr + Index, nulls())) {
+ ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index);
+ uint64_t ThisAddr = SectionAddr + Index;
+ if (!DisAsm->getInstruction(Inst, Size, ThisBytes, ThisAddr, nulls())) {
if (Size == 0)
- Size = 1;
+ Size = std::min(ThisBytes.size(),
+ DisAsm->suggestBytesToSkip(ThisBytes, ThisAddr));
continue;
}
uint64_t Addr = Index + SectionAddr;