Validate UTF-8 by default when parsing IDL. Support Unicode values > U+FFFF in parse

author Ben Hamilton <beng@fb.com>

Mon, 1 Aug 2016 21:04:51 +0000 (14:04 -0700)

committer Ben Hamilton <beng@fb.com>

Mon, 1 Aug 2016 21:05:24 +0000 (14:05 -0700)
author Ben Hamilton <beng@fb.com>
Mon, 1 Aug 2016 21:04:51 +0000 (14:04 -0700)
committer Ben Hamilton <beng@fb.com>
Mon, 1 Aug 2016 21:05:24 +0000 (14:05 -0700)
diff --git a/include/flatbuffers/idl.h b/include/flatbuffers/idl.h

index 25706e8..5909a4e 100644 (file)
--- a/include/flatbuffers/idl.h
+++ b/include/flatbuffers/idl.h
@@ -348,6 +348,7 @@ struct IDLOptions {
    bool escape_proto_identifiers;
    bool generate_object_based_api;
    bool union_value_namespacing;
+  bool allow_non_utf8;
  
    // Possible options for the more general generator below.
    enum Language { kJava, kCSharp, kGo, kMAX };
@@ -370,6 +371,7 @@ struct IDLOptions {
        escape_proto_identifiers(false),
        generate_object_based_api(false),
        union_value_namespacing(true),
+      allow_non_utf8(false),
        lang(IDLOptions::kJava) {}
  };
  
diff --git a/include/flatbuffers/util.h b/include/flatbuffers/util.h

index 7bd7513..baf5bdd 100644 (file)
--- a/include/flatbuffers/util.h
+++ b/include/flatbuffers/util.h
@@ -276,6 +276,10 @@ inline int FromUTF8(const char **in) {
    }
    if ((**in << len) & 0x80) return -1;  // Bit after leading 1's must be 0.
    if (!len) return *(*in)++;
+  // UTF-8 encoded values with a length are between 2 and 4 bytes.
+  if (len < 2 || len > 4) {
+    return -1;
+  }
    // Grab initial bits of the code.
    int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
    for (int i = 0; i < len - 1; i++) {
@@ -283,6 +287,32 @@ inline int FromUTF8(const char **in) {
      ucc <<= 6;
      ucc |= *(*in)++ & 0x3F;  // Grab 6 more bits of the code.
    }
+  // UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
+  // UTF-16 surrogate pairs).
+  if (ucc >= 0xD800 && ucc <= 0xDFFF) {
+    return -1;
+  }
+  // UTF-8 must represent code points in their shortest possible encoding.
+  switch (len) {
+    case 2:
+      // Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
+      if (ucc < 0x0080 || ucc > 0x07FF) {
+        return -1;
+      }
+      break;
+    case 3:
+      // Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
+      if (ucc < 0x0800 || ucc > 0xFFFF) {
+        return -1;
+      }
+      break;
+    case 4:
+      // Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
+      if (ucc < 0x10000 || ucc > 0x10FFFF) {
+        return -1;
+      }
+      break;
+  }
    return ucc;
  }
  
diff --git a/src/flatc.cpp b/src/flatc.cpp

index b174cbd..44ce913 100644 (file)
--- a/src/flatc.cpp
+++ b/src/flatc.cpp
@@ -106,6 +106,9 @@ static void Error(const std::string &err, bool usage, bool show_exe_name) {
        "  --version          Print the version number of flatc and exit.\n"
        "  --strict-json      Strict JSON: field names must be / will be quoted,\n"
        "                     no trailing commas in tables/vectors.\n"
+      "  --allow-non-utf8   Pass non-UTF-8 input through parser and emit nonstandard\n"
+      "                     \\x escapes in JSON. (Default is to raise parse error on\n"
+      "                     non-UTF-8 input.)\n"
        "  --defaults-json    Output fields whose value is the default when\n"
        "                     writing JSON\n"
        "  --unknown-json     Allow fields in JSON that are not defined in the\n"
@@ -184,6 +187,8 @@ int main(int argc, const char *argv[]) {
          conform_to_schema = argv[argi];
        } else if(arg == "--strict-json") {
          opts.strict_json = true;
+      } else if(arg == "--allow-non-utf8") {
+        opts.allow_non_utf8 = true;
        } else if(arg == "--no-js-exports") {
          opts.skip_js_exports = true;
        } else if(arg == "--defaults-json") {
diff --git a/src/idl_gen_text.cpp b/src/idl_gen_text.cpp

index dd96912..3e41a0a 100644 (file)
--- a/src/idl_gen_text.cpp
+++ b/src/idl_gen_text.cpp
@@ -93,7 +93,7 @@ template<typename T> void PrintVector(const Vector<T> &v, Type type,
    text += "]";
  }
  
-static void EscapeString(const String &s, std::string *_text) {
+static void EscapeString(const String &s, std::string *_text, const IDLOptions& opts) {
    std::string &text = *_text;
    text += "\"";
    for (uoffset_t i = 0; i < s.size(); i++) {
@@ -113,17 +113,32 @@ static void EscapeString(const String &s, std::string *_text) {
            // Not printable ASCII data. Let's see if it's valid UTF-8 first:
            const char *utf8 = s.c_str() + i;
            int ucc = FromUTF8(&utf8);
-          if (ucc >= 0x80 && ucc <= 0xFFFF) {
-            // Parses as Unicode within JSON's \uXXXX range, so use that.
-            text += "\\u";
-            text += IntToStringHex(ucc, 4);
+          if (ucc < 0) {
+            if (opts.allow_non_utf8) {
+              text += "\\x";
+              text += IntToStringHex(static_cast<uint8_t>(c), 2);
+            } else {
+              // We previously checked for non-UTF-8 and returned a parse error,
+              // so we shouldn't reach here.
+              assert(0);
+            }
+          } else {
+            if (ucc <= 0xFFFF) {
+              // Parses as Unicode within JSON's \uXXXX range, so use that.
+              text += "\\u";
+              text += IntToStringHex(ucc, 4);
+            } else if (ucc <= 0x10FFFF) {
+              // Encode Unicode SMP values to a surrogate pair using two \u escapes.
+              uint32_t base = ucc - 0x10000;
+              uint16_t highSurrogate = (base >> 10) + 0xD800;
+              uint16_t lowSurrogate = (base & 0x03FF) + 0xDC00;
+              text += "\\u";
+              text += IntToStringHex(highSurrogate, 4);
+              text += "\\u";
+              text += IntToStringHex(lowSurrogate, 4);
+            }
              // Skip past characters recognized.
              i = static_cast<uoffset_t>(utf8 - s.c_str() - 1);
-          } else {
-            // It's either unprintable ASCII, arbitrary binary, or Unicode data
-            // that doesn't fit \uXXXX, so use \xXX escape code instead.
-            text += "\\x";
-            text += IntToStringHex(static_cast<uint8_t>(c), 2);
            }
          }
          break;
@@ -157,7 +172,7 @@ template<> void Print<const void *>(const void *val,
                  _text);
        break;
      case BASE_TYPE_STRING: {
-      EscapeString(*reinterpret_cast<const String *>(val), _text);
+      EscapeString(*reinterpret_cast<const String *>(val), _text, opts);
        break;
      }
      case BASE_TYPE_VECTOR:
diff --git a/src/idl_parser.cpp b/src/idl_parser.cpp

index b03655c..d845b83 100644 (file)
--- a/src/idl_parser.cpp
+++ b/src/idl_parser.cpp
@@ -61,6 +61,17 @@ static_assert(BASE_TYPE_UNION ==
  #define NEXT() ECHECK(Next())
  #define EXPECT(tok) ECHECK(Expect(tok))
  
+static bool ValidateUTF8(const std::string &str) {  // True iff FromUTF8() decodes every sequence in str without error.
+  const char *s = &str[0];  // Cursor; FromUTF8() moves it forward as it decodes.
+  const char * const sEnd = s + str.length();  // One past the last byte of str.
+  while (s < sEnd) {
+    if (FromUTF8(&s) < 0) {  // A negative result is FromUTF8()'s error signal.
+      return false;
+    }
+  }
+  return true;  // Also reached for an empty string: the loop never runs.
+}
+
  CheckedError Parser::Error(const std::string &msg) {
    error_ = file_being_parsed_.length() ? AbsolutePath(file_being_parsed_) : "";
    #ifdef _WIN32
@@ -320,6 +331,9 @@ CheckedError Parser::Next() {
              "illegal Unicode sequence (unpaired high surrogate)");
          }
          cursor_++;
+        if (!opts.allow_non_utf8 && !ValidateUTF8(attribute_)) {
+          return Error("illegal UTF-8 sequence");
+        }
          token_ = kTokenStringConstant;
          return NoError();
        }
diff --git a/tests/test.cpp b/tests/test.cpp

index 6ec4e67..cd37237 100644 (file)
--- a/tests/test.cpp
+++ b/tests/test.cpp
@@ -978,15 +978,36 @@ void IntegerOutOfRangeTest() {
  
  void UnicodeTest() {
    flatbuffers::Parser parser;
+  // Without setting allow_non_utf8 = true, we treat \x sequences as byte sequences
+  // which are then validated as UTF-8.
    TEST_EQ(parser.Parse("table T { F:string; }"
                         "root_type T;"
                         "{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
-                       "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\" }"), true);
+                       "\\u5225\\u30B5\\u30A4\\u30C8\\xE2\\x82\\xAC\\u0080\\uD83D\\uDE0E\" }"),  // Bytes E2 82 AC are the UTF-8 form of U+20AC.
+          true);
    std::string jsongen;
    parser.opts.indent_step = -1;
    GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
-  TEST_EQ(jsongen == "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
-                     "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
+  TEST_EQ(jsongen,
+          std::string(
+            "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+            "\\u5225\\u30B5\\u30A4\\u30C8\\u20AC\\u0080\\uD83D\\uDE0E\"}"));  // The three hex-escaped bytes come back as one U+20AC escape; U+1F60E stays a surrogate pair.
+}
+
+void UnicodeTestAllowNonUTF8() {  // Round-trips a string holding a byte that is not valid UTF-8.
+  flatbuffers::Parser parser;
+  parser.opts.allow_non_utf8 = true;  // Turns off the parser's UTF-8 validation of string constants.
+  TEST_EQ(parser.Parse("table T { F:string; }"
+                       "root_type T;"
+                       "{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+                       "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\\u0080\\uD83D\\uDE0E\" }"), true);  // Input carries hex escapes for bytes 0x01 and 0x80.
+  std::string jsongen;
+  parser.opts.indent_step = -1;
+  GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
+  TEST_EQ(jsongen,
+          std::string(
+            "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+            "\\u5225\\u30B5\\u30A4\\u30C8\\u0001\\x80\\u0080\\uD83D\\uDE0E\"}"));  // 0x01 comes back as a four-digit Unicode escape; the lone 0x80 is not valid UTF-8 and comes back as a hex escape.
  }
  
  void UnicodeSurrogatesTest() {
@@ -1027,6 +1048,96 @@ void UnicodeInvalidSurrogatesTest() {
      "{ F:\"\\uDC00\"}", "unpaired low surrogate");
  }
  
+void InvalidUTF8Test() {
+  // "1 byte" pattern, under min length of 2 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\x80\"}", "illegal UTF-8 sequence");
+  // 2 byte pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xDF\"}", "illegal UTF-8 sequence");
+  // 3 byte pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xEF\xBF\"}", "illegal UTF-8 sequence");
+  // 4 byte pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF7\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "5 byte" pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFB\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "6 byte" pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFD\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "7 byte" pattern, string too short
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "5 byte" pattern, over max length of 4 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFB\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "6 byte" pattern, over max length of 4 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFD\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+  // "7 byte" pattern, over max length of 4 bytes
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+
+  // Three invalid encodings for U+000A (\n, aka NEWLINE)
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xC0\x8A\"}", "illegal UTF-8 sequence");
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xE0\x80\x8A\"}", "illegal UTF-8 sequence");
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF0\x80\x80\x8A\"}", "illegal UTF-8 sequence");
+
+  // Two invalid encodings for U+00A9 (COPYRIGHT SYMBOL)
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xE0\x81\xA9\"}", "illegal UTF-8 sequence");
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF0\x80\x81\xA9\"}", "illegal UTF-8 sequence");
+
+  // Invalid encoding for U+20AC (EURO SYMBOL)
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    "{ F:\"\xF0\x82\x82\xAC\"}", "illegal UTF-8 sequence");
+
+  // UTF-16 surrogate values between U+D800 and U+DFFF cannot be encoded in UTF-8
+  TestError(
+    "table T { F:string; }"
+    "root_type T;"
+    // U+10400 "encoded" as U+D801 U+DC00
+    "{ F:\"\xED\xA0\x81\xED\xB0\x80\"}", "illegal UTF-8 sequence");
+}
+
  void UnknownFieldsTest() {
    flatbuffers::IDLOptions opts;
    opts.skip_unexpected_fields_in_json = true;
@@ -1105,8 +1216,10 @@ int main(int /*argc*/, const char * /*argv*/[]) {
    EnumStringsTest();
    IntegerOutOfRangeTest();
    UnicodeTest();
+  UnicodeTestAllowNonUTF8();
    UnicodeSurrogatesTest();
    UnicodeInvalidSurrogatesTest();
+  InvalidUTF8Test();
    UnknownFieldsTest();
    ParseUnionTest();
    ConformTest();
author	Ben Hamilton <beng@fb.com>
	Mon, 1 Aug 2016 21:04:51 +0000 (14:04 -0700)
committer	Ben Hamilton <beng@fb.com>
	Mon, 1 Aug 2016 21:05:24 +0000 (14:05 -0700)
include/flatbuffers/idl.h		patch \| blob \| history
include/flatbuffers/util.h		patch \| blob \| history
src/flatc.cpp		patch \| blob \| history
src/idl_gen_text.cpp		patch \| blob \| history
src/idl_parser.cpp		patch \| blob \| history
tests/test.cpp		patch \| blob \| history