From 7bff3eb6f9abb1c126029d456b152429902dc1d8 Mon Sep 17 00:00:00 2001
From: David Neto <dneto@google.com>
Date: Fri, 20 Nov 2015 14:21:10 -0500
Subject: [PATCH] spv_parsed_instruction_t cleanup

Add members:
 - words: a pointer to an array of words in the instruction,
   in host native endianness.
 - num_words: sizes the words member

Remove member:
 - offset

This simplifies clients of spvBinaryParse, because they don't
have to handle endianness translation.

Also, it makes the binary parse API more composable, allowing
for easy chaining of binary parse clients.  A binary parse client
is handed the array of words directly instead of having to reference
some external array of all the words in the SPIR-V binary.  It also
allows a binary parse client to mutate the instruction stream before
handing off to a downstream consumer.

TODO(dneto): Still need to write the unit tests for spvBinaryParse

Fixes: https://github.com/KhronosGroup/SPIRV-Tools/issues/1
---
 include/libspirv/libspirv.h |  12 ++--
 source/binary.cpp           | 136 ++++++++++++++++++++++++++++++++------------
 source/disassemble.cpp      |  30 +++++-----
 source/endian.cpp           |   7 +++
 source/endian.h             |   3 +
 5 files changed, 134 insertions(+), 54 deletions(-)
diff --git a/include/libspirv/libspirv.h b/include/libspirv/libspirv.h
index e1e71e2..f254787 100644
--- a/include/libspirv/libspirv.h
+++ b/include/libspirv/libspirv.h
@@ -283,8 +283,10 @@ typedef struct spv_parsed_operand_t {
 
 // An instruction parsed from a binary SPIR-V module.
 typedef struct spv_parsed_instruction_t {
-  // Location of the instruction, in words from the start of the SPIR-V binary.
-  size_t offset;
+  // An array of words for this instruction, in native endianness.
+  const uint32_t* words;
+  // The number of words in this instruction.
+  uint16_t num_words;
   SpvOp opcode;
   // The extended instruction type, if opcode is OpExtInst.  Otherwise
   // this is the "none" value.
@@ -401,8 +403,10 @@ typedef spv_result_t (*spv_parsed_header_fn_t)(
 
 // A pointer to a function that accepts a parsed SPIR-V instruction.
 // The parsed_instruction value is transient: it may be overwritten
-// or released immediately after the function has returned.  The function
-// should return SPV_SUCCESS if and only if parsing should continue.
+// or released immediately after the function has returned.  That also
+// applies to the words array member of the parsed instruction.  The
+// function should return SPV_SUCCESS if and only if parsing should
+// continue.
 typedef spv_result_t (*spv_parsed_instruction_fn_t)(
     void* user_data, const spv_parsed_instruction_t* parsed_instruction);
 
diff --git a/source/binary.cpp b/source/binary.cpp
index 1af785e..c589b6b 100755
--- a/source/binary.cpp
+++ b/source/binary.cpp
@@ -26,6 +26,7 @@
 
 #include "binary.h"
 
+#include <algorithm>
 #include <cassert>
 #include <cstring>
 #include <limits>
@@ -136,13 +137,19 @@ class Parser {
   // On failure, returns an error code and issues a diagnostic.
   spv_result_t parseInstruction();
 
-  // Parses an instruction operand with the given type.
-  // May update the expected_operands parameter, and the scalar members of the
-  // inst parameter. On success, returns SPV_SUCCESS, advances past the
-  // operand, and pushes a new entry on to the operands vector.  Otherwise
-  // returns an error code and issues a diagnostic.
-  spv_result_t parseOperand(spv_parsed_instruction_t* inst,
+  // Parses an instruction operand with the given type, for an instruction
+  // starting at inst_offset words into the SPIR-V binary.
+  // If the SPIR-V binary is the same endianness as the host, then the
+  // endian_converted_inst_words parameter is ignored.  Otherwise, this method
+  // appends the words for this operand, converted to host native endianness,
+  // to the end of endian_converted_inst_words.  This method also updates the
+  // expected_operands parameter, and the scalar members of the inst parameter.
+  // On success, returns SPV_SUCCESS, advances past the operand, and pushes a
+  // new entry on to the operands vector.  Otherwise returns an error code and
+  // issues a diagnostic.
+  spv_result_t parseOperand(size_t inst_offset, spv_parsed_instruction_t* inst,
                             const spv_operand_type_t type,
+                            std::vector<uint32_t>* endian_converted_inst_words,
                             std::vector<spv_parsed_operand_t>* operands,
                             spv_operand_pattern_t* expected_operands);
 
@@ -154,17 +161,19 @@ class Parser {
   spv_result_t setNumericTypeInfoForType(spv_parsed_operand_t* parsed_operand,
                                          uint32_t type_id);
 
-  // Records the number type for an instruction if that instruction generates
-  // a type.  For types that aren't scalar numbers, record something with
-  // number kind SPV_NUMBER_NONE.
-  void recordNumberType(const spv_parsed_instruction_t* inst);
+  // Records the number type for an instruction at the given offset, if that
+  // instruction generates a type.  For types that aren't scalar numbers,
+  // record something with number kind SPV_NUMBER_NONE.
+  void recordNumberType(size_t inst_offset,
+                        const spv_parsed_instruction_t* inst);
 
   // Returns a diagnostic stream object initialized with current position in
   // the input stream, and for the given error code. Any data written to the
   // returned object will be propagated to the current parse's diagnostic
   // object.
   libspirv::DiagnosticStream diagnostic(spv_result_t error) {
-    return libspirv::DiagnosticStream({0, 0, _.word_index}, _.diagnostic, error);
+    return libspirv::DiagnosticStream({0, 0, _.word_index}, _.diagnostic,
+                                      error);
   }
 
   // Returns a diagnostic stream object with the default parse error code.
@@ -211,6 +220,9 @@ class Parser {
     spv_diagnostic* diagnostic;  // Where diagnostics go.
     size_t word_index;           // The current position in words.
     spv_endianness_t endian;     // The endianness of the binary.
+    // Is the SPIR-V binary in a different endiannes from the host native
+    // endianness?
+    bool requires_endian_conversion;
 
     // Maps a result ID to its type ID.  By convention:
     //  - a result ID that is a type definition maps to itself.
@@ -249,6 +261,7 @@ spv_result_t Parser::parseModule() {
     return diagnostic() << "Invalid SPIR-V magic number '" << std::hex
                         << _.words[0] << "'.";
   }
+  _.requires_endian_conversion = !spvIsHostEndian(_.endian);
 
   // Process the header.
   spv_header_t header;
@@ -281,13 +294,24 @@ spv_result_t Parser::parseInstruction() {
   // The zero values for all members except for opcode are the
   // correct initial values.
   spv_parsed_instruction_t inst = {};
-  inst.offset = _.word_index;
+
+  const uint32_t first_word = peek();
+
+  // TODO(dneto): If it's too expensive to construct the following "words"
+  // and "operands" vectors for each instruction, each instruction, then make
+  // them class data members instead, and clear them here.
+
+  // If the module's endianness is different from the host native endianness,
+  // then converted_words contains the the endian-translated words in the
+  // instruction.
+  std::vector<uint32_t> endian_converted_words = {first_word};
+  if (_.requires_endian_conversion) {
+    // Most instructions have fewer than 25 words.
+    endian_converted_words.reserve(25);
+  }
 
   // After a successful parse of the instruction, the inst.operands member
   // will point to this vector's storage.
-  // TODO(dneto): If it's too expensive to construct the operands vector for
-  // each instruction, then make this a class data member instead, and clear it
-  // here.
   std::vector<spv_parsed_operand_t> operands;
   // Most instructions have fewer than 25 logical operands.
   operands.reserve(25);
@@ -295,7 +319,7 @@ spv_result_t Parser::parseInstruction() {
   assert(_.word_index < _.num_words);
   // Decompose and check the first word.
   uint16_t inst_word_count = 0;
-  spvOpcodeSplit(peek(), &inst_word_count, &inst.opcode);
+  spvOpcodeSplit(first_word, &inst_word_count, &inst.opcode);
   if (inst_word_count < 1) {
     return diagnostic() << "Invalid instruction word count: "
                         << inst_word_count;
@@ -304,6 +328,9 @@ spv_result_t Parser::parseInstruction() {
   if (grammar_.lookupOpcode(inst.opcode, &opcode_desc))
     return diagnostic() << "Invalid opcode: " << int(inst.opcode);
 
+  // Advance past the opcode word.  But remember the of the start
+  // of the instruction.
+  const size_t inst_offset = _.word_index;
   _.word_index++;
 
   // Maintains the ordered list of expected operand types.
@@ -317,11 +344,11 @@ spv_result_t Parser::parseInstruction() {
       opcode_desc->operandTypes,
       opcode_desc->operandTypes + opcode_desc->numTypes);
 
-  while (_.word_index < inst.offset + inst_word_count) {
-    const uint16_t inst_word_index = uint16_t(_.word_index - inst.offset);
+  while (_.word_index < inst_offset + inst_word_count) {
+    const uint16_t inst_word_index = uint16_t(_.word_index - inst_offset);
     if (expected_operands.empty()) {
       return diagnostic() << "Invalid instruction Op" << opcode_desc->name
-                          << " starting at word " << inst.offset
+                          << " starting at word " << inst_offset
                           << ": expected no more operands after "
                           << inst_word_index
                           << " words, but stated word count is "
@@ -330,7 +357,9 @@ spv_result_t Parser::parseInstruction() {
 
     spv_operand_type_t type = spvTakeFirstMatchableOperand(&expected_operands);
 
-    if (auto error = parseOperand(&inst, type, &operands, &expected_operands))
+    if (auto error =
+            parseOperand(inst_offset, &inst, type, &endian_converted_words,
+                         &operands, &expected_operands))
       return error;
   }
 
@@ -338,21 +367,33 @@ spv_result_t Parser::parseInstruction() {
       !spvOperandIsOptional(expected_operands.front())) {
     return diagnostic() << "End of input reached while decoding Op"
                         << opcode_desc->name << " starting at word "
-                        << inst.offset << ": expected more operands after "
+                        << inst_offset << ": expected more operands after "
                         << inst_word_count << " words.";
   }
 
-  if ((inst.offset + inst_word_count) != _.word_index) {
+  if ((inst_offset + inst_word_count) != _.word_index) {
     return diagnostic() << "Invalid word count: Instruction starting at word "
-                        << inst.offset << " says it has " << inst_word_count
-                        << " words, but found " << _.word_index - inst.offset
+                        << inst_offset << " says it has " << inst_word_count
+                        << " words, but found " << _.word_index - inst_offset
                         << " words instead.";
   }
+  assert(inst_word_count == words.size());
 
-  recordNumberType(&inst);
+  recordNumberType(inst_offset, &inst);
 
-  // Must wait until here to set the inst.operands pointer because the vector
-  // might be resized while we accumulate itse elements.
+  if (_.requires_endian_conversion) {
+    // We must wait until here to set this pointer, because the vector might
+    // have been be resized while we accumulated its elements.
+    inst.words = endian_converted_words.data();
+  } else {
+    // If no conversion is required, then just point to the underlying binary.
+    // This saves time and space.
+    inst.words = _.words + inst_offset;
+  }
+  inst.num_words = inst_word_count;
+
+  // We must wait until here to set this pointer, because the vector might
+  // have been be resized while we accumulated its elements.
   inst.operands = operands.data();
   inst.num_operands = uint16_t(operands.size());
 
@@ -365,13 +406,15 @@ spv_result_t Parser::parseInstruction() {
   return SPV_SUCCESS;
 }
 
-spv_result_t Parser::parseOperand(spv_parsed_instruction_t* inst,
+spv_result_t Parser::parseOperand(size_t inst_offset,
+                                  spv_parsed_instruction_t* inst,
                                   const spv_operand_type_t type,
+                                  std::vector<uint32_t>* words,
                                   std::vector<spv_parsed_operand_t>* operands,
                                   spv_operand_pattern_t* expected_operands) {
   // We'll fill in this result as we go along.
   spv_parsed_operand_t parsed_operand;
-  parsed_operand.offset = uint16_t(_.word_index - inst->offset);
+  parsed_operand.offset = uint16_t(_.word_index - inst_offset);
   // Most operands occupy one word.  This might be be adjusted later.
   parsed_operand.num_words = 1;
   // The type argument is the one used by the grammar to parse the instruction.
@@ -387,6 +430,10 @@ spv_result_t Parser::parseOperand(spv_parsed_instruction_t* inst,
 
   const uint32_t word = peek();
 
+  // Do the words in this operand have to be converted to native endianness?
+  // True for all but literal strings.
+  bool convert_operand_endianness = true;
+
   switch (type) {
     case SPV_OPERAND_TYPE_TYPE_ID:
       if (!word) return diagnostic() << "Error: Type Id is 0";
@@ -478,7 +525,7 @@ spv_result_t Parser::parseOperand(spv_parsed_instruction_t* inst,
       if (inst->opcode == SpvOpSwitch) {
         // The literal operands have the same type as the value
         // referenced by the selector Id.
-        const uint32_t selector_id = peekAt(inst->offset + 1);
+        const uint32_t selector_id = peekAt(inst_offset + 1);
         auto type_id_iter = _.id_to_type_id.find(selector_id);
         if (type_id_iter == _.id_to_type_id.end()) {
           return diagnostic() << "Invalid OpSwitch: selector id " << selector_id
@@ -513,7 +560,7 @@ spv_result_t Parser::parseOperand(spv_parsed_instruction_t* inst,
 
     case SPV_OPERAND_TYPE_LITERAL_STRING:
     case SPV_OPERAND_TYPE_OPTIONAL_LITERAL_STRING: {
-      // TODO(dneto): Make and use spvFixupString();
+      convert_operand_endianness = false;
       const char* string =
           reinterpret_cast<const char*>(_.words + _.word_index);
       size_t string_num_words = (strlen(string) / 4) + 1;  // Account for null.
@@ -631,7 +678,23 @@ spv_result_t Parser::parseOperand(spv_parsed_instruction_t* inst,
 
   operands->push_back(parsed_operand);
 
-  _.word_index += parsed_operand.num_words;
+  const size_t index_after_operand = _.word_index + parsed_operand.num_words;
+  if (_.requires_endian_conversion) {
+    // Copy instruction words.  Translate to native endianness as needed.
+    if (convert_operand_endianness) {
+      const spv_endianness_t endianness = _.endian;
+      std::transform(_.words + _.word_index, _.words + index_after_operand,
+                     words->end(), [endianness](const uint32_t word) {
+                       return spvFixWord(word, endianness);
+                     });
+    } else {
+      words->insert(words->end(), _.words + _.word_index,
+                    _.words + index_after_operand);
+    }
+  }
+
+  // Advance past the operand.
+  _.word_index = index_after_operand;
 
   return SPV_SUCCESS;
 }
@@ -656,16 +719,17 @@ spv_result_t Parser::setNumericTypeInfoForType(
   return SPV_SUCCESS;
 }
 
-void Parser::recordNumberType(const spv_parsed_instruction_t* inst) {
+void Parser::recordNumberType(size_t inst_offset,
+                              const spv_parsed_instruction_t* inst) {
   if (spvOpcodeGeneratesType(inst->opcode)) {
     NumberType info = {SPV_NUMBER_NONE, 0};
     if (SpvOpTypeInt == inst->opcode) {
-      const bool is_signed = peekAt(inst->offset + 3) != 0;
+      const bool is_signed = peekAt(inst_offset + 3) != 0;
       info.type = is_signed ? SPV_NUMBER_SIGNED_INT : SPV_NUMBER_UNSIGNED_INT;
-      info.bit_width = peekAt(inst->offset + 2);
+      info.bit_width = peekAt(inst_offset + 2);
     } else if (SpvOpTypeFloat == inst->opcode) {
       info.type = SPV_NUMBER_FLOATING;
-      info.bit_width = peekAt(inst->offset + 2);
+      info.bit_width = peekAt(inst_offset + 2);
     }
     // The *result* Id of a type generating instruction is the type Id.
     _.type_id_to_number_type_info[inst->result_id] = info;
diff --git a/source/disassemble.cpp b/source/disassemble.cpp
index b773256..2e16177 100644
--- a/source/disassemble.cpp
+++ b/source/disassemble.cpp
@@ -47,10 +47,6 @@ namespace {
 // A Disassembler instance converts a SPIR-V binary to its assembly
 // representation.
 class Disassembler {
-  enum { kStandardIndent = 15 };
-
-  using out_stream = libspirv::out_stream;
-
  public:
   Disassembler(const libspirv::AssemblyGrammar& grammar, uint32_t const* words,
                uint32_t options)
@@ -80,6 +76,10 @@ class Disassembler {
   spv_result_t SaveTextResult(spv_text* text_result) const;
 
  private:
+  enum { kStandardIndent = 15 };
+
+  using out_stream = libspirv::out_stream;
+
   // Emits an operand for the given instruction, where the instruction
   // is at offset words from the start of the binary.
   void EmitOperand(const spv_parsed_instruction_t& inst,
@@ -197,8 +197,7 @@ void Disassembler::EmitOperand(const spv_parsed_instruction_t& inst,
                                const uint16_t operand_index) {
   assert(operand_index < inst.num_operands);
   const spv_parsed_operand_t& operand = inst.operands[operand_index];
-  const size_t index = inst.offset + operand.offset;
-  const uint32_t word = spvFixWord(words_[index], endian_);
+  const uint32_t word = inst.words[operand.offset];
   switch (operand.type) {
     case SPV_OPERAND_TYPE_RESULT_ID:
       assert(false && "<result-id> is not supposed to be handled here");
@@ -235,7 +234,7 @@ void Disassembler::EmitOperand(const spv_parsed_instruction_t& inst,
             stream_ << int32_t(word);
             break;
           case SPV_NUMBER_UNSIGNED_INT:
-            stream_ << uint32_t(word);
+            stream_ << word;
             break;
           case SPV_NUMBER_FLOATING:
             // Assume only 32-bit floats.
@@ -246,14 +245,15 @@ void Disassembler::EmitOperand(const spv_parsed_instruction_t& inst,
             assert(false && "Unreachable");
         }
       } else if (operand.num_words == 2) {
+        // Multi-word numbers are presented with lower order words first.
         uint64_t bits =
-            spvFixDoubleWord(words_[index], words_[index + 1], endian_);
+            uint64_t(word) | (uint64_t(inst.words[operand.offset + 1]) << 32);
         switch (operand.number_kind) {
           case SPV_NUMBER_SIGNED_INT:
             stream_ << int64_t(bits);
             break;
           case SPV_NUMBER_UNSIGNED_INT:
-            stream_ << uint64_t(bits);
+            stream_ << bits;
             break;
           case SPV_NUMBER_FLOATING:
             // Assume only 64-bit floats.
@@ -268,13 +268,15 @@ void Disassembler::EmitOperand(const spv_parsed_instruction_t& inst,
       }
     } break;
     case SPV_OPERAND_TYPE_LITERAL_STRING: {
-      // Strings are always little-endian.
-      const std::string string(reinterpret_cast<const char*>(&words_[index]));
       stream_ << "\"";
       SetGreen();
-      for (auto ch : string) {
-        if (ch == '"' || ch == '\\') stream_ << '\\';
-        stream_ << ch;
+      // Strings are always little-endian, and null-terminated.
+      // Write out the characters, escaping as needed, and without copying
+      // the entire string.
+      auto c_str = reinterpret_cast<const char*>(inst.words + operand.offset);
+      for (auto p = c_str; *p; ++p) {
+        if (*p == '"' || *p == '\\') stream_ << '\\';
+        stream_ << *p;
       }
       ResetColor();
       stream_ << '"';
diff --git a/source/endian.cpp b/source/endian.cpp
index e0b2b2b..39ecbcc 100644
--- a/source/endian.cpp
+++ b/source/endian.cpp
@@ -78,3 +78,10 @@ spv_result_t spvBinaryEndianness(spv_const_binary binary,
 
   return SPV_ERROR_INVALID_BINARY;
 }
+
+bool spvIsHostEndian(spv_endianness_t endian) {
+  return ((SPV_ENDIANNESS_LITTLE == endian) &&
+          (I32_ENDIAN_LITTLE == I32_ENDIAN_HOST)) ||
+         ((SPV_ENDIANNESS_BIG == endian) &&
+          (I32_ENDIAN_BIG == I32_ENDIAN_HOST));
+}
diff --git a/source/endian.h b/source/endian.h
index c175cf4..5c25f33 100644
--- a/source/endian.h
+++ b/source/endian.h
@@ -43,4 +43,7 @@ uint64_t spvFixDoubleWord(const uint32_t low, const uint32_t high,
 spv_result_t spvBinaryEndianness(const spv_const_binary binary,
                                  spv_endianness_t* endian);
 
+// Returns true if the given endianness matches the host's native endiannes.
+bool spvIsHostEndian(spv_endianness_t endian);
+
 #endif  // LIBSPIRV_ENDIAN_H_
-- 
2.7.4