Accept long UTF-8 Literal Strings
author David Neto <dneto@google.com>
Thu, 8 Oct 2015 19:20:25 +0000 (15:20 -0400)
committer David Neto <dneto@google.com>
Mon, 26 Oct 2015 16:55:33 +0000 (12:55 -0400)
include/libspirv/libspirv.h
source/text.h
source/text_handler.cpp
test/TextLiteral.cpp
test/TextToBinary.Literal.cpp
test/UnitSPIRV.h

index 26534de..16f7ccb 100644 (file)
@@ -58,13 +58,17 @@ extern "C" {
 
 // SPIR-V 1.0 limits
 #define SPV_LIMIT_INSTRUCTION_WORD_COUNT_MAX 0xffff
+#define SPV_LIMIT_LITERAL_STRING_UTF8_CHARS_MAX 0xffff
+
+// A single Unicode character in UTF-8 encoding can take
+// up to 4 bytes.
+#define SPV_LIMIT_LITERAL_STRING_BYTES_MAX \
+  (SPV_LIMIT_LITERAL_STRING_UTF8_CHARS_MAX * 4)
 
 // NOTE: These are set to the minimum maximum values
 // TODO(dneto): Check these.
 
 // libspirv limits.
-#define SPV_LIMIT_LITERAL_NAME_MAX 0x00000400
-#define SPV_LIMIT_LITERAL_STRING_MAX 0x00010000
 #define SPV_LIMIT_RESULT_ID_BOUND 0x00400000
 #define SPV_LIMIT_CONTROL_FLOW_NEST_DEPTH 0x00000400
 #define SPV_LIMIT_GLOBAL_VARIABLES_MAX 0x00010000
index ebb8d8c..94e5b18 100644 (file)
@@ -55,20 +55,23 @@ typedef struct spv_literal_t {
     float f;
     double d;
     // Allow room for the null terminator, and two surrounding quotes.
-    char str[SPV_LIMIT_LITERAL_STRING_MAX + 3];
+    // TODO(dneto): This is a very large array.  We should use a
+    // different kind of container.
+    char str[SPV_LIMIT_LITERAL_STRING_BYTES_MAX + 3];
   } value;
 } spv_literal_t;
 
 
 // Functions
 
-/// @brief Convert the input text to one of the number types.
+/// @brief Convert the input text to one of the number types, or to
+/// a string.
 ///
 /// String literals must be surrounded by double-quotes ("), which are
 /// then stripped.
 ///
 /// @param[in] textValue input text to parse
-/// @param[out] pLiteral the returned literal number
+/// @param[out] pLiteral the returned literal
 ///
 /// @return result code
 spv_result_t spvTextToLiteral(const char *textValue, spv_literal_t *pLiteral);
index 847df69..a0e2512 100644 (file)
@@ -342,8 +342,8 @@ spv_result_t AssemblyContext::binaryEncodeString(
 
   // TODO(dneto): We can just defer this check until later.
   if (newWordCount > SPV_LIMIT_INSTRUCTION_WORD_COUNT_MAX) {
-    diagnostic() << "Instruction word count '"
-             << SPV_LIMIT_INSTRUCTION_WORD_COUNT_MAX << "'exceeded.";
+    diagnostic() << "Instruction too long: more than "
+             << SPV_LIMIT_INSTRUCTION_WORD_COUNT_MAX << " words.";
     return SPV_ERROR_INVALID_TEXT;
   }
 
index 4f125a6..7b277b2 100644 (file)
@@ -129,18 +129,33 @@ TEST(TextLiteral, GoodString) {
 TEST(TextLiteral, StringTooLong) {
   spv_literal_t l;
   std::string too_long = std::string("\"") +
-                         std::string(SPV_LIMIT_LITERAL_STRING_MAX + 1, 'a') +
+                         std::string(SPV_LIMIT_LITERAL_STRING_BYTES_MAX + 1, 'a') +
                          "\"";
   EXPECT_EQ(SPV_ERROR_OUT_OF_MEMORY, spvTextToLiteral(too_long.data(), &l));
 }
 
 TEST(TextLiteral, GoodLongString) {
   spv_literal_t l;
-  std::string unquoted(SPV_LIMIT_LITERAL_STRING_MAX, 'a');
+  // The universal limit of 65535 Unicode characters might make this
+  // fail validation, since SPV_LIMIT_LITERAL_STRING_BYTES_MAX is 4*65535.
+  // However, as an implementation detail, we'll allow the assembler
+  // to parse it.  Otherwise we'd have to scan the string for valid UTF-8
+  // characters.
+  std::string unquoted(SPV_LIMIT_LITERAL_STRING_BYTES_MAX, 'a');
   std::string good_long = std::string("\"") + unquoted + "\"";
   EXPECT_EQ(SPV_SUCCESS, spvTextToLiteral(good_long.data(), &l));
   EXPECT_EQ(SPV_LITERAL_TYPE_STRING, l.type);
   EXPECT_STREQ(unquoted.data(), l.value.str);
 }
 
+TEST(TextLiteral, GoodUTF8String) {
+  // Build a quoted string holding SPV_LIMIT_LITERAL_STRING_UTF8_CHARS_MAX
+  // characters, each of which has a 4-byte UTF-8 encoding.
+  const std::string payload =
+      spvtest::MakeLongUTF8String(SPV_LIMIT_LITERAL_STRING_UTF8_CHARS_MAX);
+  const std::string quoted = "\"" + payload + "\"";
+
+  // Parsing must succeed and yield the unquoted bytes unchanged.
+  spv_literal_t literal;
+  const spv_result_t status = spvTextToLiteral(quoted.data(), &literal);
+  EXPECT_EQ(SPV_SUCCESS, status);
+  EXPECT_EQ(SPV_LITERAL_TYPE_STRING, literal.type);
+  EXPECT_STREQ(payload.data(), literal.value.str);
+}
+
 }  // anonymous namespace
index 1af57ec..596b645 100644 (file)
@@ -50,11 +50,56 @@ TEST_F(TextToBinaryTest, LiteralNumberInPlaceOfLiteralString) {
       CompileFailure(R"(OpSourceExtension 1000)"));
 }
 
-TEST_F(TextToBinaryTest, LiteralStringTooLong) {
+TEST_F(TextToBinaryTest, LiteralStringASCIILong) {
   // SPIR-V allows strings up to 65535 characters.
+  // Test the simple case of UTF-8 code points corresponding
+  // to ASCII characters.
+  EXPECT_EQ(65535, SPV_LIMIT_LITERAL_STRING_UTF8_CHARS_MAX);
   const std::string code =
-      "OpSourceExtension \"" + std::string(65535, 'o') + "\"\n";
+      "OpSourceExtension \"" +
+      std::string(SPV_LIMIT_LITERAL_STRING_UTF8_CHARS_MAX, 'o') + "\"\n";
   EXPECT_EQ(code, EncodeAndDecodeSuccessfully(code));
 }
 
+TEST_F(TextToBinaryTest, LiteralStringUTF8LongEncodings) {
+  // SPIR-V allows strings up to 65535 characters.
+  // Test the case of many Unicode characters, each of which has
+  // a 4-byte UTF-8 encoding.
+
+  // An instruction is at most 65535 words long. The first one
+  // contains the wordcount and opcode.  So the worst case number of
+  // 4-byte UTF-8 characters is 65533, since we also need to
+  // store a terminating null character.
+
+  // This string fits exactly into 65534 words.
+  const std::string good_string =
+      spvtest::MakeLongUTF8String(65533)
+      // The following single character (U+8000) has a 3 byte encoding,
+      // which fits snugly against the terminating null.  The bytes are
+      // spelled out explicitly so the literal does not depend on the
+      // compiler's execution character set.
+      + "\xE8\x80\x80";
+
+  // These strings will overflow any instruction with 0 or 1 other
+  // arguments, respectively.
+  const std::string bad_0_arg_string = spvtest::MakeLongUTF8String(65534);
+  const std::string bad_1_arg_string = spvtest::MakeLongUTF8String(65533);
+
+  const std::string good_code = "OpSourceExtension \"" + good_string + "\"\n";
+  EXPECT_EQ(good_code, EncodeAndDecodeSuccessfully(good_code));
+
+  // Prove that it works on more than one instruction.
+  const std::string good_code_2 = "OpSourceContinued \"" + good_string + "\"\n";
+  EXPECT_EQ(good_code_2, EncodeAndDecodeSuccessfully(good_code_2));
+
+  // Failure cases.
+  EXPECT_EQ(
+      R"(Instruction too long: more than 65535 words.)",
+      CompileFailure("OpSourceExtension \"" + bad_0_arg_string + "\"\n"));
+  EXPECT_EQ(
+      R"(Instruction too long: more than 65535 words.)",
+      CompileFailure("OpSourceContinued \"" + bad_0_arg_string + "\"\n"));
+  EXPECT_EQ(
+      R"(Instruction too long: more than 65535 words.)",
+      CompileFailure("OpName %target \"" + bad_1_arg_string + "\"\n"));
+}
+
 }  // anonymous namespace
index ed39081..acd4fd2 100644 (file)
@@ -191,6 +191,21 @@ class EnumCase {
   std::vector<uint32_t> operands_;
 };
 
+// Returns a string with num_4_byte_chars Unicode characters,
+// each of which has a 4-byte UTF-8 encoding.
+inline std::string MakeLongUTF8String(size_t num_4_byte_chars) {
+  // An example of a longest valid UTF-8 character: U+1F30D.
+  // Its UTF-8 bytes are written out explicitly rather than as a
+  // universal-character-name, because the encoding of the latter in a
+  // narrow string literal depends on the compiler's execution character
+  // set and is not guaranteed to be UTF-8.
+  const std::string earth_africa("\xF0\x9F\x8C\x8D");
+  // Unsigned literal avoids a signed/unsigned comparison against size().
+  EXPECT_EQ(4u, earth_africa.size());
+  std::string result;
+  result.reserve(num_4_byte_chars * 4);
+  for (size_t i = 0; i < num_4_byte_chars; ++i) {
+    result += earth_africa;
+  }
+  EXPECT_EQ(4 * num_4_byte_chars, result.size());
+  return result;
+}
+
 }  // namespace spvtest
 
 #endif