[MC,llvm-objdump,ARM] Target-dependent disassembly resync policy.

author Simon Tatham <simon.tatham@arm.com>

Tue, 26 Jul 2022 08:20:41 +0000 (09:20 +0100)

committer Simon Tatham <simon.tatham@arm.com>

Tue, 26 Jul 2022 08:35:30 +0000 (09:35 +0100)
author Simon Tatham <simon.tatham@arm.com>
Tue, 26 Jul 2022 08:20:41 +0000 (09:20 +0100)
committer Simon Tatham <simon.tatham@arm.com>
Tue, 26 Jul 2022 08:35:30 +0000 (09:35 +0100)
diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h

index de069ff..ebe81fd 100644 (file)
--- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
+++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
@@ -171,6 +171,29 @@ public:
    // It should help move much of the target specific code from llvm-objdump to
    // respective target disassemblers.
  
+  /// Suggest a distance to skip in a buffer of data to find the next
+  /// place to look for the start of an instruction. For example, if
+  /// all instructions have a fixed alignment, this might advance to
+  /// the next multiple of that alignment.
+  ///
+  /// If not overridden, the default is 1.
+  ///
+  /// \param Address  - The address, in the memory space of region, of the
+  ///                   starting point (typically the first byte of something
+  ///                   that did not decode as a valid instruction at all).
+  /// \param Bytes    - A reference to the actual bytes at Address. May be
+  ///                   needed in order to determine the width of an
+  ///                   unrecognized instruction (e.g. in Thumb this is a simple
+  ///                   consistent criterion that doesn't require knowing the
+  ///                   specific instruction). The caller can pass as much data
+  ///                   as they have available, and the function is required to
+  ///                   make a reasonable default choice if not enough data is
+  ///                   available to make a better one.
+  /// \return         - A number of bytes to skip. Must always be greater than
+  ///                   zero. May be greater than the size of Bytes.
+  virtual uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                                      uint64_t Address) const;
+
  private:
    MCContext &Ctx;
  
diff --git a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp

index cf98cb8..3ee4339 100644 (file)
--- a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -20,6 +20,11 @@ MCDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
    return None;
  }
  
+uint64_t MCDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                                            uint64_t Address) const {
+  return 1;
+}
+
  bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
                                                uint64_t Address, bool IsBranch,
                                                uint64_t Offset, uint64_t OpSize,
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp

index 1b65589..2f20232 100644 (file)
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -350,6 +350,14 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
    return MCDisassembler::Fail;
  }
  
+uint64_t AArch64Disassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address) const {
+  // AArch64 instructions are always 4 bytes wide, so there's no point
+  // in skipping any smaller number of bytes if an instruction can't
+  // be decoded.
+  return 4;
+}
+
  static MCSymbolizer *
  createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
                                  LLVMSymbolLookupCallback SymbolLookUp,
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h

index 6761d44..b9f7854 100644 (file)
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -30,6 +30,9 @@ public:
    MCDisassembler::DecodeStatus
    getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
                   uint64_t Address, raw_ostream &CStream) const override;
+
+  uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                              uint64_t Address) const override;
  };
  
  } // end namespace llvm
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp

index 9acd492..f814959 100644 (file)
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -139,6 +139,9 @@ public:
                                ArrayRef<uint8_t> Bytes, uint64_t Address,
                                raw_ostream &CStream) const override;
  
+  uint64_t suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                              uint64_t Address) const override;
+
  private:
    DecodeStatus getARMInstruction(MCInst &Instr, uint64_t &Size,
                                   ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -739,6 +742,33 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
    }
  }
  
+uint64_t ARMDisassembler::suggestBytesToSkip(ArrayRef<uint8_t> Bytes,
+                                             uint64_t Address) const {
+  // In Arm state, instructions are always 4 bytes wide, so there's no
+  // point in skipping any smaller number of bytes if an instruction
+  // can't be decoded.
+  if (!STI.getFeatureBits()[ARM::ModeThumb])
+    return 4;
+
+  // In a Thumb instruction stream, a halfword is a standalone 2-byte
+  // instruction if and only if its value is less than 0xE800.
+  // Otherwise, it's the first halfword of a 4-byte instruction.
+  //
+  // So, if we can see the upcoming halfword, we can judge on that
+  // basis, and maybe skip a whole 4-byte instruction that we don't
+  // know how to decode, without accidentally trying to interpret its
+  // second half as something else.
+  //
+  // If we don't have the instruction data available, we just have to
+  // recommend skipping the minimum sensible distance, which is 2
+  // bytes.
+  if (Bytes.size() < 2)
+    return 2;
+
+  uint16_t Insn16 = (Bytes[1] << 8) | Bytes[0];
+  return Insn16 < 0xE800 ? 2 : 4;
+}
+
  DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                                               ArrayRef<uint8_t> Bytes,
                                               uint64_t Address,
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr-resync.test b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr-resync.test

new file mode 100644 (file)

index 0000000..5ebcd79
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr-resync.test
@@ -0,0 +1,52 @@
+# RUN: yaml2obj %s | llvm-objdump -d --mcpu=cortex-a8 - | FileCheck %s
+
+# Test that unrecognized instructions are skipped in a way that makes
+# sense for the Arm instruction set encoding.
+#
+# The first three instructions in this file are marked by the mapping
+# symbols as in Arm state, with the one in the middle unknown, and we
+# expect the disassembler to skip 4 bytes because that's the width of
+# any Arm instruction.
+#
+# At address 0xc there's a mapping symbol that says we're now in Thumb
+# mode, and in that mode we include both a 16-bit and a 32-bit unknown
+# Thumb instruction, which the disassembler will identify by the simple
+# encoding criterion that tells you the instruction length without
+# having to recognize it specifically.
+#
+# Finally we end with a single byte, to ensure nothing gets confused
+# when the Thumb instruction stream doesn't contain enough data to
+# even do that check.
+
+# CHECK:       0: 64 00 a0 e3   mov     r0, #100
+# CHECK-NEXT:  4: ff ff ff ff   <unknown>
+# CHECK-NEXT:  8: 12 03 81 e0   add     r0, r1, r2, lsl r3
+
+# CHECK:       c: 64 20         movs    r0, #100
+# CHECK-NEXT:  e: 0e b8         <unknown>
+# CHECK-NEXT: 10: 40 18         adds    r0, r0, r1
+# CHECK-NEXT: 12: 4f f0 64 00   mov.w   r0, #100
+# CHECK-NEXT: 16: ee ff cc dd   <unknown>
+# CHECK-NEXT: 1a: 01 eb c2 00   add.w   r0, r1, r2, lsl #3
+# CHECK-NEXT: 1e: 9a            <unknown>
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS32
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_ARM
+  Flags:           [ EF_ARM_EABI_VER5 ]
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x4
+    Content:         6400a0e3ffffffff120381e064200eb840184ff06400eeffccdd01ebc2009a
+Symbols:
+  - Name:            '$a'
+    Section:         .text
+  - Name:            '$t'
+    Section:         .text
+    Value:           0x0c
+...
diff --git a/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test

index 77972f9..ee31997 100644 (file)
--- a/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test
+++ b/llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test
@@ -13,8 +13,8 @@
  ##   llvm-objdump --mattr=+ext1
  
  # CHECK:        00000000 <.text>:
-# CHECK-NEXT:       0: cb              <unknown>
-# CHECK-NEXT:       1: f3 f7 8b be     b.w 0xffff3d1b <{{.+}}> @ imm = #-49898
+# CHECK-NEXT:       0: cb f3 f7 8b   <unknown>
+# CHECK-NEXT:       4: be            <unknown>
  
  --- !ELF
  FileHeader:
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp

index c486088..5feebc5 100644 (file)
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -1022,10 +1022,12 @@ static void collectLocalBranchTargets(
      // Disassemble a real instruction and record function-local branch labels.
      MCInst Inst;
      uint64_t Size;
-    bool Disassembled = DisAsm->getInstruction(
-        Inst, Size, Bytes.slice(Index - SectionAddr), Index, nulls());
+    ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index - SectionAddr);
+    bool Disassembled =
+        DisAsm->getInstruction(Inst, Size, ThisBytes, Index, nulls());
      if (Size == 0)
-      Size = 1;
+      Size = std::min(ThisBytes.size(),
+                      DisAsm->suggestBytesToSkip(ThisBytes, Index));
  
      if (Disassembled && MIA) {
        uint64_t Target;
@@ -1068,10 +1070,11 @@ static void addSymbolizer(
    for (size_t Index = 0; Index != Bytes.size();) {
      MCInst Inst;
      uint64_t Size;
-    DisAsm->getInstruction(Inst, Size, Bytes.slice(Index), SectionAddr + Index,
-                           nulls());
+    ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index - SectionAddr);
+    DisAsm->getInstruction(Inst, Size, ThisBytes, Index, nulls());
      if (Size == 0)
-      Size = 1;
+      Size = std::min(ThisBytes.size(),
+                      DisAsm->suggestBytesToSkip(ThisBytes, Index));
      Index += Size;
    }
    ArrayRef<uint64_t> LabelAddrsRef = SymbolizerPtr->getReferencedAddresses();
@@ -1538,11 +1541,13 @@ static void disassembleObject(const Target *TheTarget, ObjectFile &Obj,
            // Disassemble a real instruction or a data when disassemble all is
            // provided
            MCInst Inst;
-          bool Disassembled =
-              DisAsm->getInstruction(Inst, Size, Bytes.slice(Index),
-                                     SectionAddr + Index, CommentStream);
+          ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index);
+          uint64_t ThisAddr = SectionAddr + Index;
+          bool Disassembled = DisAsm->getInstruction(Inst, Size, ThisBytes,
+                                                     ThisAddr, CommentStream);
            if (Size == 0)
-            Size = 1;
+            Size = std::min(ThisBytes.size(),
+                            DisAsm->suggestBytesToSkip(ThisBytes, ThisAddr));
  
            LVP.update({Index, Section.getIndex()},
                       {Index + Size, Section.getIndex()}, Index + Size != End);
diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp

index 0336b61..e8d271a 100644 (file)
--- a/llvm/tools/sancov/sancov.cpp
+++ b/llvm/tools/sancov/sancov.cpp
@@ -783,10 +783,12 @@ static void getObjectCoveragePoints(const object::ObjectFile &O,
      for (uint64_t Index = 0, Size = 0; Index < Section.getSize();
           Index += Size) {
        MCInst Inst;
-      if (!DisAsm->getInstruction(Inst, Size, Bytes.slice(Index),
-                                  SectionAddr + Index, nulls())) {
+      ArrayRef<uint8_t> ThisBytes = Bytes.slice(Index);
+      uint64_t ThisAddr = SectionAddr + Index;
+      if (!DisAsm->getInstruction(Inst, Size, ThisBytes, ThisAddr, nulls())) {
          if (Size == 0)
-          Size = 1;
+          Size = std::min(ThisBytes.size(),
+                          DisAsm->suggestBytesToSkip(ThisBytes, ThisAddr));
          continue;
        }
        uint64_t Addr = Index + SectionAddr;
author	Simon Tatham <simon.tatham@arm.com>
	Tue, 26 Jul 2022 08:20:41 +0000 (09:20 +0100)
committer	Simon Tatham <simon.tatham@arm.com>
	Tue, 26 Jul 2022 08:35:30 +0000 (09:35 +0100)
llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h		patch \| blob \| history
llvm/lib/MC/MCDisassembler/MCDisassembler.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp		patch \| blob \| history
llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h		patch \| blob \| history
llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp		patch \| blob \| history
llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr-resync.test	[new file with mode: 0644]	patch \| blob
llvm/test/tools/llvm-objdump/ELF/ARM/unknown-instr.test		patch \| blob \| history
llvm/tools/llvm-objdump/llvm-objdump.cpp		patch \| blob \| history
llvm/tools/sancov/sancov.cpp		patch \| blob \| history