[WebAssembly] Initial Disassembler.
authorSam Clegg <sbc@chromium.org>
Thu, 10 May 2018 22:16:44 +0000 (22:16 +0000)
committerSam Clegg <sbc@chromium.org>
Thu, 10 May 2018 22:16:44 +0000 (22:16 +0000)
This implements a new table-gen emitter to create tables for
a wasm disassembler, and a disassembler to use them.

Comes with 2 tests that exercise a few instructions manually. It is also able
to disassemble large .wasm files with objdump reasonably well.

Not working so well, to be addressed in followups:
- objdump appears to be passing an incorrect starting point.
- since the disassembler works an instruction at a time, and it is
  disassembling stack instructions, it has no idea of pseudo register assignments.
  These registers are required for the instruction printing code that follows.
  For now, all such registers appear in the output as $0.

Patch by Wouter van Oortmerssen

Differential Revision: https://reviews.llvm.org/D45848

llvm-svn: 332052

12 files changed:
llvm/lib/Target/WebAssembly/CMakeLists.txt
llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
llvm/lib/Target/WebAssembly/WebAssembly.td
llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
llvm/test/MC/Disassembler/WebAssembly/lit.local.cfg [new file with mode: 0644]
llvm/test/MC/Disassembler/WebAssembly/wasm.txt [new file with mode: 0644]
llvm/unittests/MC/Disassembler.cpp
llvm/utils/TableGen/CMakeLists.txt
llvm/utils/TableGen/DisassemblerEmitter.cpp
llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp [new file with mode: 0644]
llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h [new file with mode: 0644]

index c761867..41b7694 100644 (file)
@@ -3,6 +3,7 @@ set(LLVM_TARGET_DEFINITIONS WebAssembly.td)
 tablegen(LLVM WebAssemblyGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM WebAssemblyGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM WebAssemblyGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM WebAssemblyGenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel)
 tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
index f75832f..2f09602 100644 (file)
 #include "WebAssembly.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/TargetRegistry.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "wasm-disassembler"
 
+using DecodeStatus = MCDisassembler::DecodeStatus;
+
+#include "WebAssemblyGenDisassemblerTables.inc"
+
 namespace {
 class WebAssemblyDisassembler final : public MCDisassembler {
   std::unique_ptr<const MCInstrInfo> MCII;
@@ -60,11 +67,120 @@ extern "C" void LLVMInitializeWebAssemblyDisassembler() {
                                          createWebAssemblyDisassembler);
 }
 
-MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
-    MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
-    raw_ostream &OS, raw_ostream &CS) const {
+// Fetches the byte at offset Size and advances Size past it. Returns -1
+// instead, leaving Size unchanged, when Size is already at or beyond the end
+// of Bytes.
+static int nextByte(ArrayRef<uint8_t> Bytes, uint64_t &Size) {
+  if (Size < Bytes.size())
+    return Bytes[Size++];
+  return -1;
+}
 
-  // TODO: Implement disassembly.
+// Decodes one LEB128 value starting at offset Size and appends it to MI as an
+// immediate operand. Signed selects SLEB128 decoding; otherwise the value is
+// decoded as ULEB128 and converted to int64_t. On success Size is advanced by
+// the encoded length and true is returned. If the decoder reports an error,
+// MI and Size are left untouched and false is returned.
+static bool parseLEBImmediate(MCInst &MI, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, bool Signed) {
+  const uint8_t *const Begin = Bytes.data() + Size;
+  const uint8_t *const End = Bytes.data() + Bytes.size();
+  unsigned Length = 0;
+  const char *Err = nullptr;
+  int64_t Value;
+  if (Signed)
+    Value = decodeSLEB128(Begin, &Length, End, &Err);
+  else
+    Value = static_cast<int64_t>(decodeULEB128(Begin, &Length, End, &Err));
+  if (Err)
+    return false;
+  MI.addOperand(MCOperand::createImm(Value));
+  Size += Length;
+  return true;
+}
+
+// Reads a little-endian floating-point immediate of type T (instantiated with
+// float and double) at offset Size, appends it to MI as an FP immediate
+// operand (converted to double) and advances Size by sizeof(T). Returns false,
+// leaving MI and Size untouched, when fewer than sizeof(T) bytes remain.
+template <typename T>
+bool parseFPImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
+  if (Size + sizeof(T) > Bytes.size())
+    return false;
+  // endian::read returns the value already converted to host byte order. The
+  // previous memcpy + byte_swap sequence discarded byte_swap's return value,
+  // so the little-endian-to-host conversion was never applied (wrong result
+  // on big-endian hosts). Alignment 1 is used because the instruction stream
+  // gives no alignment guarantee.
+  T Val = support::endian::read<T, support::endianness::little, 1>(
+      Bytes.data() + Size);
+  Size += sizeof(T);
+  MI.addOperand(MCOperand::createFPImm(static_cast<double>(Val)));
+  return true;
+}
 
-  return MCDisassembler::Fail;
+// Decodes a single instruction from the start of Bytes into MI.
+//
+// Size is reset to 0 and then advanced as opcode and operand bytes are
+// consumed, so on Success it holds the encoded length of the instruction.
+// Returns Fail when the bytes run out, when the opcode maps to an unused
+// table slot, or when an operand cannot be parsed; Size may already have been
+// advanced past the bytes consumed so far in that case.
+// The Address and OS parameters are not used; CS is stored as CommentStream.
+MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
+    MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
+    raw_ostream & /*OS*/, raw_ostream &CS) const {
+  CommentStream = &CS;
+  Size = 0;
+  // nextByte yields -1 when no bytes are left.
+  auto Opc = nextByte(Bytes, Size);
+  if (Opc < 0)
+    return MCDisassembler::Fail;
+  // The first byte indexes the generated base table.
+  const auto *WasmInst = &InstructionTable0[Opc];
+  // If this is a prefix byte, indirect to another table.
+  if (WasmInst->ET == ET_Prefix) {
+    WasmInst = nullptr;
+    // Linear search, so far only 2 entries.
+    // PrefixTable is terminated by an entry whose Table is null.
+    for (auto PT = PrefixTable; PT->Table; PT++) {
+      if (PT->Prefix == Opc) {
+        WasmInst = PT->Table;
+        break;
+      }
+    }
+    if (!WasmInst)
+      return MCDisassembler::Fail;
+    // The byte following the prefix selects the entry in the prefix's table.
+    Opc = nextByte(Bytes, Size);
+    if (Opc < 0)
+      return MCDisassembler::Fail;
+    WasmInst += Opc;
+  }
+  if (WasmInst->ET == ET_Unused)
+    return MCDisassembler::Fail;
+  // At this point we must have a valid instruction to decode.
+  assert(WasmInst->ET == ET_Instruction);
+  MI.setOpcode(WasmInst->Opcode);
+  // Parse any operands.
+  // Each table entry records the operand type of every operand; the type
+  // decides how (and whether) bytes are consumed from the stream.
+  for (uint8_t OPI = 0; OPI < WasmInst->NumOperands; OPI++) {
+    switch (WasmInst->Operands[OPI]) {
+    // ULEB operands:
+    case WebAssembly::OPERAND_BASIC_BLOCK:
+    case WebAssembly::OPERAND_LOCAL:
+    case WebAssembly::OPERAND_GLOBAL:
+    case WebAssembly::OPERAND_FUNCTION32:
+    case WebAssembly::OPERAND_OFFSET32:
+    case WebAssembly::OPERAND_P2ALIGN:
+    case WebAssembly::OPERAND_TYPEINDEX:
+    case MCOI::OPERAND_IMMEDIATE: {
+      if (!parseLEBImmediate(MI, Size, Bytes, false))
+        return MCDisassembler::Fail;
+      break;
+    }
+    // SLEB operands:
+    case WebAssembly::OPERAND_I32IMM:
+    case WebAssembly::OPERAND_I64IMM:
+    case WebAssembly::OPERAND_SIGNATURE: {
+      if (!parseLEBImmediate(MI, Size, Bytes, true))
+        return MCDisassembler::Fail;
+      break;
+    }
+    // FP operands.
+    case WebAssembly::OPERAND_F32IMM: {
+      if (!parseFPImmediate<float>(MI, Size, Bytes))
+        return MCDisassembler::Fail;
+      break;
+    }
+    case WebAssembly::OPERAND_F64IMM: {
+      if (!parseFPImmediate<double>(MI, Size, Bytes))
+        return MCDisassembler::Fail;
+      break;
+    }
+    case MCOI::OPERAND_REGISTER: {
+      // These are NOT actually in the instruction stream, but MC is going to
+      // expect operands to be present for them!
+      // FIXME: can MC re-generate register assignments or do we have to
+      // do this? Since this function decodes a single instruction, we don't
+      // have the proper context for tracking an operand stack here.
+      // Register 0 is a placeholder; no bytes are consumed for it.
+      MI.addOperand(MCOperand::createReg(0));
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown operand type in WebAssemblyDisassembler");
+    }
+  }
+  return MCDisassembler::Success;
 }
index 6629ce6..10fa798 100644 (file)
@@ -46,7 +46,7 @@ void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
 
 void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
                                        StringRef Annot,
-                                       const MCSubtargetInfo & /*STI*/) {
+                                       const MCSubtargetInfo &STI) {
   // Print the instruction (this uses the AsmStrings from the .td files).
   printInstruction(MI, OS);
 
@@ -194,20 +194,16 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 }
 
-void
-WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(const MCInst *MI,
-                                                       unsigned OpNo,
-                                                       raw_ostream &O) {
+// Prints the immediate at operand OpNo as ":p2align=<value>", or prints
+// nothing when it equals WebAssembly::GetDefaultP2Align for MI's opcode.
+void WebAssemblyInstPrinter::printWebAssemblyP2AlignOperand(
+    const MCInst *MI, unsigned OpNo, raw_ostream &O) {
   int64_t Imm = MI->getOperand(OpNo).getImm();
   if (Imm == WebAssembly::GetDefaultP2Align(MI->getOpcode()))
     return;
   O << ":p2align=" << Imm;
 }
 
-void
-WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI,
-                                                         unsigned OpNo,
-                                                         raw_ostream &O) {
+void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(
+    const MCInst *MI, unsigned OpNo, raw_ostream &O) {
   int64_t Imm = MI->getOperand(OpNo).getImm();
   switch (WebAssembly::ExprType(Imm)) {
   case WebAssembly::ExprType::Void: break;
index ad1549f..2f301da 100644 (file)
@@ -82,7 +82,15 @@ def WebAssemblyAsmParser : AsmParser {
   let ShouldEmitMatchRegisterName = 0;
 }
 
+def WebAssemblyAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  int PassSubtarget = 0;
+  int Variant = 0;
+  bit isMCAsmWriter = 1;
+}
+
 def WebAssembly : Target {
   let InstructionSet = WebAssemblyInstrInfo;
   let AssemblyParsers  = [WebAssemblyAsmParser];
+  let AssemblyWriters = [WebAssemblyAsmWriter];
 }
index c667f93..d59aba4 100644 (file)
@@ -57,6 +57,10 @@ def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
 }
 } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
+// This is technically a control-flow instruction, since all it affects is the
+// IP.
+def NOP : I<(outs), (ins), [], "nop", 0x01>;
+
 // Placemarkers to indicate the start or end of a block or loop scope.
 // These use/clobber VALUE_STACK to prevent them from being moved into the
 // middle of an expression tree.
diff --git a/llvm/test/MC/Disassembler/WebAssembly/lit.local.cfg b/llvm/test/MC/Disassembler/WebAssembly/lit.local.cfg
new file mode 100644 (file)
index 0000000..2b7fce1
--- /dev/null
@@ -0,0 +1,3 @@
+if not 'WebAssembly' in config.root.targets:
+    config.unsupported = True
+
diff --git a/llvm/test/MC/Disassembler/WebAssembly/wasm.txt b/llvm/test/MC/Disassembler/WebAssembly/wasm.txt
new file mode 100644 (file)
index 0000000..d6229f6
--- /dev/null
@@ -0,0 +1,33 @@
+# RUN: llvm-mc --disassemble %s -triple=wasm32-unknown-unknown | FileCheck %s
+
+# CHECK: .text
+
+# CHECK: nop
+0x01
+
+# CHECK: i32.add $0=, $0, $0
+# NOTE: registers are meaningless, as there is no context for what they are.
+0x6a
+
+# CHECK: i64.const $0=, -1
+0x42 0x7F
+
+# CHECK: i64.load32_u $0=, 16($0):p2align=1
+0x35 0x01 0x10
+
+# CHECK: block
+# 3
+# FIXME: WebAssemblyInstPrinter does not currently print block number.
+0x02 0x03
+
+# CHECK: call_indirect
+# $0=, 128, 0
+# FIXME: WebAssemblyInstPrinter does not print immediates.
+0x11 0x80 0x01 0x00
+
+# CHECK: get_local $0=, 128
+0x20 0x80 0x01
+
+# Prefix byte example:
+# CHECK: i64.trunc_u:sat/f64 $0=, $0
+0xFC 0x07
index dd0f1ef..ef4b57b 100644 (file)
@@ -21,7 +21,7 @@ static const char *symbolLookupCallback(void *DisInfo, uint64_t ReferenceValue,
   return nullptr;
 }
 
-TEST(Disassembler, Test1) {
+TEST(Disassembler, X86Test) {
   llvm::InitializeAllTargetInfos();
   llvm::InitializeAllTargetMCs();
   llvm::InitializeAllDisassemblers();
@@ -62,3 +62,46 @@ TEST(Disassembler, Test1) {
 
   LLVMDisasmDispose(DCR);
 }
+
+// Disassembles three WebAssembly instructions back to back through the C
+// disassembler API and checks the reported size and printed text of each.
+TEST(Disassembler, WebAssemblyTest) {
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllDisassemblers();
+
+  // Encodings, in order: i32.add (1 byte), i64.const -1 (2 bytes) and
+  // i64.load32_u printed as "16($0):p2align=1" (3 bytes).
+  uint8_t Bytes[] = {0x6a, 0x42, 0x7F, 0x35, 0x01, 0x10};
+  uint8_t *BytesP = Bytes;
+  const char OutStringSize = 100;
+  char OutString[OutStringSize];
+  LLVMDisasmContextRef DCR = LLVMCreateDisasm(
+      "wasm32-unknown-unknown-elf", nullptr, 0, nullptr, symbolLookupCallback);
+  // No disassembler context could be created for this triple (presumably the
+  // WebAssembly target is not part of this build); there is nothing to check.
+  if (!DCR)
+    return;
+
+  size_t InstSize;
+  unsigned NumBytes = sizeof(Bytes);
+  unsigned PC = 0;
+
+  // First instruction: i32.add.
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 1U);
+  EXPECT_EQ(StringRef(OutString), "\ti32.add \t$0=, $0, $0");
+  // Step past the bytes just decoded.
+  PC += InstSize;
+  BytesP += InstSize;
+  NumBytes -= InstSize;
+
+  // Second instruction: i64.const with an SLEB-encoded -1.
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 2U);
+  EXPECT_EQ(StringRef(OutString), "\ti64.const\t$0=, -1");
+
+  PC += InstSize;
+  BytesP += InstSize;
+  NumBytes -= InstSize;
+
+  // Third instruction: i64.load32_u with two immediate bytes.
+  InstSize = LLVMDisasmInstruction(DCR, BytesP, NumBytes, PC, OutString,
+                                   OutStringSize);
+  EXPECT_EQ(InstSize, 3U);
+  EXPECT_EQ(StringRef(OutString), "\ti64.load32_u\t$0=, 16($0):p2align=1");
+
+  LLVMDisasmDispose(DCR);
+}
index 36bad44..bd7eca4 100644 (file)
@@ -44,6 +44,7 @@ add_tablegen(llvm-tblgen LLVM
   X86FoldTablesEmitter.cpp
   X86ModRMFilters.cpp
   X86RecognizableInstr.cpp
+  WebAssemblyDisassemblerEmitter.cpp
   CTagsEmitter.cpp
   )
 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
index 971ad6a..b99a0a9 100644 (file)
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeGenTarget.h"
+#include "WebAssemblyDisassemblerEmitter.h"
 #include "X86DisassemblerTables.h"
 #include "X86RecognizableInstr.h"
 #include "llvm/TableGen/Error.h"
@@ -125,6 +126,14 @@ void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) {
     return;
   }
 
+  // WebAssembly has variable length opcodes, so can't use EmitFixedLenDecoder
+  // below (which depends on a Size table-gen Record), and also uses a custom
+  // disassembler.
+  if (Target.getName() == "WebAssembly") {
+    emitWebAssemblyDisassemblerTables(OS, Target.getInstructionsByEnumValue());
+    return;
+  }
+
   // ARM and Thumb have a CHECK() macro to deal with DecodeStatuses.
   if (Target.getName() == "ARM" || Target.getName() == "Thumb" ||
       Target.getName() == "AArch64" || Target.getName() == "ARM64") {
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
new file mode 100644 (file)
index 0000000..df63337
--- /dev/null
@@ -0,0 +1,116 @@
+//===- WebAssemblyDisassemblerEmitter.cpp - Disassembler tables -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the WebAssembly Disassembler Emitter.
+// It contains the implementation of the disassembler tables.
+// Documentation for the disassembler emitter in general can be found in
+// WebAssemblyDisassemblerEmitter.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyDisassemblerEmitter.h"
+#include "llvm/TableGen/Record.h"
+
+namespace llvm {
+
+// Writes the C++ source of the WebAssembly disassembler tables to OS.
+//
+// The output consists of an EntryType enum, a WebAssemblyInstruction struct,
+// one 256-entry InstructionTable<prefix> array per opcode prefix that has at
+// least one instruction (prefix 0 being the base table), and a
+// null-terminated PrefixTable mapping each non-zero prefix byte to its array.
+// Instructions without an "Inst" field, or whose opcode value is 0xFFFFFFFF,
+// are skipped.
+void emitWebAssemblyDisassemblerTables(
+    raw_ostream &OS,
+    const ArrayRef<const CodeGenInstruction *> &NumberedInstructions) {
+  // First let's organize all opcodes by (prefix) byte. Prefix 0 is the
+  // starting table.
+  // Maps prefix -> (opcode byte -> (instruction index, instruction)).
+  std::map<unsigned,
+           std::map<unsigned, std::pair<unsigned, const CodeGenInstruction *>>>
+      OpcodeTable;
+  for (unsigned I = 0; I != NumberedInstructions.size(); ++I) {
+    auto &CGI = *NumberedInstructions[I];
+    auto &Def = *CGI.TheDef;
+    if (!Def.getValue("Inst"))
+      continue;
+    auto &Inst = *Def.getValueAsBitsInit("Inst");
+    auto Opc = static_cast<unsigned>(
+        reinterpret_cast<IntInit *>(Inst.convertInitializerTo(IntRecTy::get()))
+            ->getValue());
+    if (Opc == 0xFFFFFFFF)
+      continue; // No opcode defined.
+    assert(Opc <= 0xFFFF);
+    // The high byte of the 16-bit opcode is the prefix, the low byte is the
+    // index within that prefix's table.
+    auto Prefix = Opc >> 8;
+    Opc = Opc & 0xFF;
+    auto &CGIP = OpcodeTable[Prefix][Opc];
+    if (!CGIP.second ||
+        // Make sure we store the variant with the least amount of operands,
+        // which is the one without explicit registers. Only few instructions
+        // have these currently, would be good to have for all of them.
+        // FIXME: this picks the first of many typed variants, which is
+        // currently the except_ref one, though this shouldn't matter for
+        // disassembly purposes.
+        CGIP.second->Operands.OperandList.size() >
+            CGI.Operands.OperandList.size()) {
+      CGIP = std::make_pair(I, &CGI);
+    }
+  }
+  // Emit the shared declarations used by every table.
+  OS << "#include \"MCTargetDesc/WebAssemblyMCTargetDesc.h\"\n";
+  OS << "\n";
+  OS << "namespace llvm {\n\n";
+  OS << "enum EntryType : uint8_t { ";
+  OS << "ET_Unused, ET_Prefix, ET_Instruction };\n\n";
+  OS << "struct WebAssemblyInstruction {\n";
+  OS << "  uint16_t Opcode;\n";
+  OS << "  EntryType ET;\n";
+  OS << "  uint8_t NumOperands;\n";
+  OS << "  uint8_t Operands[4];\n";
+  OS << "};\n\n";
+  // Output one table per prefix.
+  for (auto &PrefixPair : OpcodeTable) {
+    if (PrefixPair.second.empty())
+      continue;
+    OS << "WebAssemblyInstruction InstructionTable" << PrefixPair.first;
+    OS << "[] = {\n";
+    // Every table gets exactly 256 entries so it can be indexed by a byte.
+    for (unsigned I = 0; I <= 0xFF; I++) {
+      auto InstIt = PrefixPair.second.find(I);
+      if (InstIt != PrefixPair.second.end()) {
+        // Regular instruction.
+        assert(InstIt->second.second);
+        auto &CGI = *InstIt->second.second;
+        OS << "  // 0x";
+        OS.write_hex(static_cast<unsigned long long>(I));
+        OS << ": " << CGI.AsmString << "\n";
+        OS << "  { " << InstIt->second.first << ", ET_Instruction, ";
+        OS << CGI.Operands.OperandList.size() << ", {\n";
+        for (auto &Op : CGI.Operands.OperandList) {
+          OS << "      " << Op.OperandType << ",\n";
+        }
+        OS << "    }\n";
+      } else {
+        auto PrefixIt = OpcodeTable.find(I);
+        // If we have a non-empty table for it that's not 0, this is a prefix.
+        // Prefix entries are only emitted into the base table
+        // (PrefixPair.first == 0); every other empty slot is marked unused.
+        if (PrefixIt != OpcodeTable.end() && I && !PrefixPair.first) {
+          OS << "  { 0, ET_Prefix, 0, {}";
+        } else {
+          OS << "  { 0, ET_Unused, 0, {}";
+        }
+      }
+      // Closes the entry opened by whichever branch ran above.
+      OS << "  },\n";
+    }
+    OS << "};\n\n";
+  }
+  // Create a table of all extension tables:
+  OS << "struct { uint8_t Prefix; const WebAssemblyInstruction *Table; }\n";
+  OS << "PrefixTable[] = {\n";
+  for (auto &PrefixPair : OpcodeTable) {
+    // The base table (prefix 0) is not an extension table.
+    if (PrefixPair.second.empty() || !PrefixPair.first)
+      continue;
+    OS << "  { " << PrefixPair.first << ", InstructionTable"
+       << PrefixPair.first;
+    OS << " },\n";
+  }
+  // Sentinel entry with a null Table terminates the list.
+  OS << "  { 0, nullptr }\n};\n\n";
+  OS << "} // End llvm namespace\n";
+}
+
+} // namespace llvm
diff --git a/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h b/llvm/utils/TableGen/WebAssemblyDisassemblerEmitter.h
new file mode 100644 (file)
index 0000000..91f820f
--- /dev/null
@@ -0,0 +1,30 @@
+//===- WebAssemblyDisassemblerEmitter.h - Disassembler tables ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the WebAssembly Disassembler Emitter.
+// It contains the interface of the disassembler tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UTILS_TABLEGEN_WEBASSEMBLYDISASSEMBLEREMITTER_H
+#define LLVM_UTILS_TABLEGEN_WEBASSEMBLYDISASSEMBLEREMITTER_H
+
+#include "CodeGenInstruction.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// Writes the WebAssembly disassembler tables for the instructions in
+/// \p NumberedInstructions to \p OS.
+void emitWebAssemblyDisassemblerTables(
+    raw_ostream &OS,
+    const ArrayRef<const CodeGenInstruction *> &NumberedInstructions);
+
+} // namespace llvm
+
+#endif