}
if ((**in << len) & 0x80) return -1; // Bit after leading 1's must be 0.
if (!len) return *(*in)++;
+ // UTF-8 encoded values with a length are between 2 and 4 bytes.
+ if (len < 2 || len > 4) {
+ return -1;
+ }
// Grab initial bits of the code.
int ucc = *(*in)++ & ((1 << (7 - len)) - 1);
for (int i = 0; i < len - 1; i++) {
ucc <<= 6;
ucc |= *(*in)++ & 0x3F; // Grab 6 more bits of the code.
}
+ // UTF-8 cannot encode values between 0xD800 and 0xDFFF (reserved for
+ // UTF-16 surrogate pairs).
+ if (ucc >= 0xD800 && ucc <= 0xDFFF) {
+ return -1;
+ }
+ // UTF-8 must represent code points in their shortest possible encoding.
+ switch (len) {
+ case 2:
+ // Two bytes of UTF-8 can represent code points from U+0080 to U+07FF.
+ if (ucc < 0x0080 || ucc > 0x07FF) {
+ return -1;
+ }
+ break;
+ case 3:
+ // Three bytes of UTF-8 can represent code points from U+0800 to U+FFFF.
+ if (ucc < 0x0800 || ucc > 0xFFFF) {
+ return -1;
+ }
+ break;
+ case 4:
+ // Four bytes of UTF-8 can represent code points from U+10000 to U+10FFFF.
+ if (ucc < 0x10000 || ucc > 0x10FFFF) {
+ return -1;
+ }
+ break;
+ }
return ucc;
}
text += "]";
}
-static void EscapeString(const String &s, std::string *_text) {
+static void EscapeString(const String &s, std::string *_text, const IDLOptions& opts) {
std::string &text = *_text;
text += "\"";
for (uoffset_t i = 0; i < s.size(); i++) {
// Not printable ASCII data. Let's see if it's valid UTF-8 first:
const char *utf8 = s.c_str() + i;
int ucc = FromUTF8(&utf8);
- if (ucc >= 0x80 && ucc <= 0xFFFF) {
- // Parses as Unicode within JSON's \uXXXX range, so use that.
- text += "\\u";
- text += IntToStringHex(ucc, 4);
+ if (ucc < 0) {
+ if (opts.allow_non_utf8) {
+ text += "\\x";
+ text += IntToStringHex(static_cast<uint8_t>(c), 2);
+ } else {
+ // We previously checked for non-UTF-8 and returned a parse error,
+ // so we shouldn't reach here.
+ assert(0);
+ }
+ } else {
+ if (ucc <= 0xFFFF) {
+ // Parses as Unicode within JSON's \uXXXX range, so use that.
+ text += "\\u";
+ text += IntToStringHex(ucc, 4);
+ } else if (ucc <= 0x10FFFF) {
+ // Encode Unicode SMP values to a surrogate pair using two \u escapes.
+ uint32_t base = ucc - 0x10000;
+ uint16_t highSurrogate = (base >> 10) + 0xD800;
+ uint16_t lowSurrogate = (base & 0x03FF) + 0xDC00;
+ text += "\\u";
+ text += IntToStringHex(highSurrogate, 4);
+ text += "\\u";
+ text += IntToStringHex(lowSurrogate, 4);
+ }
// Skip past characters recognized.
i = static_cast<uoffset_t>(utf8 - s.c_str() - 1);
- } else {
- // It's either unprintable ASCII, arbitrary binary, or Unicode data
- // that doesn't fit \uXXXX, so use \xXX escape code instead.
- text += "\\x";
- text += IntToStringHex(static_cast<uint8_t>(c), 2);
}
}
break;
_text);
break;
case BASE_TYPE_STRING: {
- EscapeString(*reinterpret_cast<const String *>(val), _text);
+ EscapeString(*reinterpret_cast<const String *>(val), _text, opts);
break;
}
case BASE_TYPE_VECTOR:
void UnicodeTest() {  // Parses JSON containing \u and \x string escapes, regenerates text, compares it.
flatbuffers::Parser parser;
+ // Without setting allow_non_utf8 = true, \x escapes in the JSON are taken as raw bytes that must
+ // form valid UTF-8: \xE2\x82\xAC below is U+20AC, and the expected output prints it as a \u escape.
TEST_EQ(parser.Parse("table T { F:string; }"
"root_type T;"
"{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
- "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\" }"), true);
+ "\\u5225\\u30B5\\u30A4\\u30C8\\xE2\\x82\\xAC\\u0080\\uD83D\\uDE0E\" }"),
+ true);
std::string jsongen;
parser.opts.indent_step = -1;  // NOTE(review): presumably selects single-line output, as the expected text has no newlines - confirm.
GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
- TEST_EQ(jsongen == "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
- "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\"}", true);
+ TEST_EQ(jsongen,
+ std::string(
+ "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+ "\\u5225\\u30B5\\u30A4\\u30C8\\u20AC\\u0080\\uD83D\\uDE0E\"}"));  // Surrogate pair D83D/DE0E (U+1F60E) is expected to round-trip unchanged.
+}
+
+void UnicodeTestAllowNonUTF8() {  // Same parse/generate round trip as UnicodeTest, with allow_non_utf8 enabled.
+ flatbuffers::Parser parser;
+ parser.opts.allow_non_utf8 = true;  // With this set, the input's lone \x80 byte (not valid UTF-8) is expected to parse.
+ TEST_EQ(parser.Parse("table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+ "\\u5225\\u30B5\\u30A4\\u30C8\\x01\\x80\\u0080\\uD83D\\uDE0E\" }"), true);
+ std::string jsongen;
+ parser.opts.indent_step = -1;  // NOTE(review): presumably selects single-line output, as the expected text has no newlines - confirm.
+ GenerateText(parser, parser.builder_.GetBufferPointer(), &jsongen);
+ TEST_EQ(jsongen,
+ std::string(
+ "{F: \"\\u20AC\\u00A2\\u30E6\\u30FC\\u30B6\\u30FC"
+ "\\u5225\\u30B5\\u30A4\\u30C8\\u0001\\x80\\u0080\\uD83D\\uDE0E\"}"));  // Input \x01 is expected back as \u0001; the invalid byte stays \x80.
}
void UnicodeSurrogatesTest() {
"{ F:\"\\uDC00\"}", "unpaired low surrogate");
}
+void InvalidUTF8Test() {
+ // Every input below embeds raw (unescaped) bytes that are not well-formed
+ // UTF-8. Each call hands TestError the schema + JSON text together with the
+ // message "illegal UTF-8 sequence". TestError is defined elsewhere in this
+ // file (presumably it asserts that parsing fails with that message).
+
+ // --- Bad lead byte or truncated sequence ---
+ // "1 byte" pattern (0x80 = 10000000, a bare continuation byte), under min
+ // length of 2 bytes
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\x80\"}", "illegal UTF-8 sequence");
+ // 2 byte pattern (lead 0xDF = 110xxxxx), string too short: no continuation
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xDF\"}", "illegal UTF-8 sequence");
+ // 3 byte pattern (lead 0xEF = 1110xxxx), string too short: 1 of 2
+ // continuation bytes present
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xEF\xBF\"}", "illegal UTF-8 sequence");
+ // 4 byte pattern (lead 0xF7 = 11110xxx), string too short: 2 of 3
+ // continuation bytes present
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xF7\xBF\xBF\"}", "illegal UTF-8 sequence");
+ // "5 byte" pattern (lead 0xFB = 111110xx), string too short
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xFB\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+ // "6 byte" pattern (lead 0xFD = 1111110x), string too short
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xFD\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+ // "7 byte" pattern (lead 0xFE = 11111110), string too short
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+ // "5 byte" pattern with all 4 continuation bytes, over max length of 4 bytes
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xFB\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+ // "6 byte" pattern with all 5 continuation bytes, over max length of 4 bytes
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xFD\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+ // "7 byte" pattern with all 6 continuation bytes, over max length of 4 bytes
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xFE\xBF\xBF\xBF\xBF\xBF\xBF\"}", "illegal UTF-8 sequence");
+
+ // --- Overlong (non-shortest-form) encodings ---
+ // Three invalid encodings for U+000A (\n, aka NEWLINE), whose only valid
+ // form is the single byte 0x0A: padded to 2, 3 and 4 bytes.
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xC0\x8A\"}", "illegal UTF-8 sequence");
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xE0\x80\x8A\"}", "illegal UTF-8 sequence");
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xF0\x80\x80\x8A\"}", "illegal UTF-8 sequence");
+
+ // Two invalid encodings for U+00A9 (COPYRIGHT SIGN), whose only valid form
+ // is the 2-byte C2 A9: padded to 3 bytes (E0 82 A9) and 4 bytes
+ // (F0 80 82 A9). The second-to-last byte must be 0x82; 0x81 would carry
+ // payload 0x69 (U+0069) and no longer exercise a 2-byte code point.
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xE0\x82\xA9\"}", "illegal UTF-8 sequence");
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xF0\x80\x82\xA9\"}", "illegal UTF-8 sequence");
+
+ // Invalid encoding for U+20AC (EURO SIGN), whose only valid form is the
+ // 3-byte E2 82 AC: padded to 4 bytes.
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ "{ F:\"\xF0\x82\x82\xAC\"}", "illegal UTF-8 sequence");
+
+ // --- Surrogates ---
+ // UTF-16 surrogate values between U+D800 and U+DFFF cannot be encoded in UTF-8
+ TestError(
+ "table T { F:string; }"
+ "root_type T;"
+ // U+10400 "encoded" as U+D801 U+DC00, each surrogate written as 3 bytes
+ "{ F:\"\xED\xA0\x81\xED\xB0\x80\"}", "illegal UTF-8 sequence");
+}
+
void UnknownFieldsTest() {
flatbuffers::IDLOptions opts;
opts.skip_unexpected_fields_in_json = true;
EnumStringsTest();
IntegerOutOfRangeTest();
UnicodeTest();
+ UnicodeTestAllowNonUTF8();
UnicodeSurrogatesTest();
UnicodeInvalidSurrogatesTest();
+ InvalidUTF8Test();
UnknownFieldsTest();
ParseUnionTest();
ConformTest();