From 9140144d5161124623a27cf8b8038f6e7c9bb74d Mon Sep 17 00:00:00 2001
From: Wouter van Oortmerssen <wvo@google.com>
Date: Mon, 7 Jul 2014 17:34:23 -0700
Subject: [PATCH] Added functionality to assign field ids manually in a schema

New attribute:

-   `id: n` (on a table field): manually set the field identifier to `n`.
    If you use this attribute, you must use it on ALL fields of this table,
    and the numbers must be a contiguous range from 0 onwards.
    Additionally, since a union type effectively adds two fields, its
    id must be that of the second field (the first field is the type
    field and not explicitly declared in the schema).
    For example, if the last field before the union field had id 6,
    the union field should have id 8, and the union's type field will
    implicitly be 7.
    IDs allow the fields to be placed in any order in the schema.
    When a new field is added to the schema, it must use the next available ID.

Change-Id: I8690f105f3a2d31fdcb75a4fab4130692b12c62f
Tested: on Windows
---
 docs/html/md__cpp_usage.html      | 13 +++++++++--
 docs/html/md__schemas.html        |  9 ++++----
 docs/source/Schemas.md            | 21 ++++++++++++++++--
 src/flatc.cpp                     |  5 +++--
 src/idl_parser.cpp                | 46 ++++++++++++++++++++++++++++++++++++++-
 tests/MyGame/Example/Monster.java |  2 +-
 tests/monster_test.fbs            | 22 +++++++++----------
 tests/monster_test_generated.h    |  2 +-
 8 files changed, 96 insertions(+), 24 deletions(-)
diff --git a/docs/html/md__cpp_usage.html b/docs/html/md__cpp_usage.html
index 386fffc..f02436e 100644
--- a/docs/html/md__cpp_usage.html
+++ b/docs/html/md__cpp_usage.html
@@ -64,9 +64,9 @@ auto inventory = fbb.CreateVector(inv, 10);
 <p><code>CreateString</code> can also take an <code>std::string</code>, or a <code>const char *</code> with an explicit length, and is suitable for holding UTF-8 and binary data if needed.</p>
 <p><code>CreateVector</code> can also take an <code>std::vector</code>. The offset it returns is typed, i.e. can only be used to set fields of the correct type below. To create a vector of struct objects (which will be stored as contiguous memory in the buffer, use <code>CreateVectorOfStructs</code> instead. </p><pre class="fragment">Vec3 vec(1, 2, 3);
 </pre><p><code>Vec3</code> is the first example of code from our generated header. Structs (unlike tables) translate to simple structs in C++, so we can construct them in a familiar way.</p>
-<p>We have now serialized the non-scalar components of of the monster example, so we could create the monster something like this: </p><pre class="fragment">auto mloc = CreateMonster(fbb, &amp;vec, 150, 80, name, inventory, Color_Red, Offset&lt;void&gt;(0), Any_NONE);
+<p>We have now serialized the non-scalar components of of the monster example, so we could create the monster something like this: </p><pre class="fragment">auto mloc = CreateMonster(fbb, &amp;vec, 150, 80, name, inventory, Color_Red, 0, Any_NONE);
 </pre><p>Note that we're passing <code>150</code> for the <code>mana</code> field, which happens to be the default value: this means the field will not actually be written to the buffer, since we'll get that value anyway when we query it. This is a nice space savings, since it is very common for fields to be at their default. It means we also don't need to be scared to add fields only used in a minority of cases, since they won't bloat up the buffer sizes if they're not actually used.</p>
-<p>We do something similarly for the union field <code>test</code> by specifying a <code>0</code> offset and the <code>NONE</code> enum value (part of every union) to indicate we don't actually want to write this field.</p>
+<p>We do something similarly for the union field <code>test</code> by specifying a <code>0</code> offset and the <code>NONE</code> enum value (part of every union) to indicate we don't actually want to write this field. You can use <code>0</code> also as a default for other non-scalar types, such as strings, vectors and tables.</p>
 <p>Tables (like <code>Monster</code>) give you full flexibility on what fields you write (unlike <code>Vec3</code>, which always has all fields set because it is a <code>struct</code>). If you want even more control over this (i.e. skip fields even when they are not default), instead of the convenient <code>CreateMonster</code> call we can also build the object field-by-field manually: </p><pre class="fragment">MonsterBuilder mb(fbb);
 mb.add_pos(&amp;vec);
 mb.add_hp(80);
@@ -95,6 +95,15 @@ assert(inv-&gt;Get(9) == 9);
 <p>For structs, layout is deterministic and guaranteed to be the same accross platforms (scalars are aligned to their own size, and structs themselves to their largest member), and you are allowed to access this memory directly by using <code>sizeof()</code> and <code>memcpy</code> on the pointer to a struct, or even an array of structs.</p>
 <p>To compute offsets to sub-elements of a struct, make sure they are a structs themselves, as then you can use the pointers to figure out the offset without having to hardcode it. This is handy for use of arrays of structs with calls like <code>glVertexAttribPointer</code> in OpenGL or similar APIs.</p>
 <p>It is important to note is that structs are still little endian on all machines, so only use tricks like this if you can guarantee you're not shipping on a big endian machine (an <code>assert(FLATBUFFERS_LITTLEENDIAN)</code> would be wise).</p>
+<h3>Access of untrusted buffers</h3>
+<p>The generated accessor functions access fields over offsets, which is very quick. These offsets are not verified at run-time, so a malformed buffer could cause a program to crash by accessing random memory.</p>
+<p>When you're processing large amounts of data from a source you know (e.g. your own generated data on disk), this is acceptable, but when reading data from the network that can potentially have been modified by an attacker, this is undesirable.</p>
+<p>For this reason, you can optionally use a buffer verifier before you access the data. This verifier will check all offsets, all sizes of fields, and null termination of strings to ensure that when a buffer is accessed, all reads will end up inside the buffer.</p>
+<p>Each root type will have a verification function generated for it, e.g. for <code>Monster</code>, you can call:</p>
+<p>bool ok = VerifyMonsterBuffer(Verifier(buf, len));</p>
+<p>if <code>ok</code> is true, the buffer is safe to read.</p>
+<p>Besides untrusted data, this function may be useful to call in debug mode, as extra insurance against data being corrupted somewhere along the way.</p>
+<p>While verifying a buffer isn't "free", it is typically faster than a full traversal (since any scalar data is not actually touched), and since it may cause the buffer to be brought into cache before reading, the actual overhead may be even lower than expected.</p>
 <h2>Text &amp; schema parsing</h2>
 <p>Using binary buffers with the generated header provides a super low overhead use of FlatBuffer data. There are, however, times when you want to use text formats, for example because it interacts better with source control, or you want to give your users easy access to data.</p>
 <p>Another reason might be that you already have a lot of data in JSON format, or a tool that generates JSON, and if you can write a schema for it, this will provide you an easy way to use that data directly.</p>
diff --git a/docs/html/md__schemas.html b/docs/html/md__schemas.html
index 52bfc75..57f7b27 100644
--- a/docs/html/md__schemas.html
+++ b/docs/html/md__schemas.html
@@ -84,7 +84,7 @@ root_type Monster;
 <p>Tables are the main way of defining objects in FlatBuffers, and consist of a name (here <code>Monster</code>) and a list of fields. Each field has a name, a type, and optionally a default value (if omitted, it defaults to 0 / NULL).</p>
 <p>Each field is optional: It does not have to appear in the wire representation, and you can choose to omit fields for each individual object. As a result, you have the flexibility to add fields without fear of bloating your data. This design is also FlatBuffer's mechanism for forward and backwards compatibility. Note that:</p>
 <ul>
-<li>You can add new fields in the schema ONLY at the end of a table definition. Older data will still read correctly, and give you the default value when read. Older code will simply ignore the new field.</li>
+<li>You can add new fields in the schema ONLY at the end of a table definition. Older data will still read correctly, and give you the default value when read. Older code will simply ignore the new field. If you want to have flexibility to use any order for fields in your schema, you can manually assign ids (much like Protocol Buffers), see the <code>id</code> attribute below.</li>
 <li>You cannot delete fields you don't use anymore from the schema, but you can simply stop writing them into your data for almost the same effect. Additionally you can mark them as <code>deprecated</code> as in the example above, which will prevent the generation of accessors in the generated C++, as a way to enforce the field not being used any more. (careful: this may break code!).</li>
 <li>You may change field names and table names, if you're ok with your code breaking until you've renamed them there too.</li>
 </ul>
@@ -106,7 +106,7 @@ root_type Monster;
 <p>Values are a sequence of digits, optionally followed by a <code>.</code> and more digits for float constants, and optionally prefixed by a <code>-</code>. Non-scalar defaults are currently not supported (always NULL).</p>
 <p>You generally do not want to change default values after they're initially defined. Fields that have the default value are not actually stored in the serialized data but are generated in code, so when you change the default, you'd now get a different value than from code generated from an older version of the schema. There are situations however where this may be desirable, especially if you can ensure a simultaneous rebuild of all code.</p>
 <h3>Enums</h3>
-<p>Define a sequence of named constants, each with a given value, or increasing by one from the previous one. The default first value is <code>0</code>. As you can see in the enum declaration, you specify the underlying integral type of the enum with <code>:</code> (in this case <code>byte</code>), which then determines the type of any fields declared with this enum type. If you omit the underlying type, it will be <code>short</code>.</p>
+<p>Define a sequence of named constants, each with a given value, or increasing by one from the previous one. The default first value is <code>0</code>. As you can see in the enum declaration, you specify the underlying integral type of the enum with <code>:</code> (in this case <code>byte</code>), which then determines the type of any fields declared with this enum type.</p>
 <h3>Unions</h3>
 <p>Unions share a lot of properties with enums, but instead of new names for constants, you use names of tables. You can then declare a union field which can hold a reference to any of those types, and additionally a hidden field with the suffix <code>_type</code> is generated that holds the corresponding enum value, allowing you to know which type to cast to at runtime.</p>
 <h3>Namespaces</h3>
@@ -119,15 +119,16 @@ root_type Monster;
 <p>Attributes may be attached to a declaration, behind a field, or after the name of a table/struct/enum/union. These may either have a value or not. Some attributes like <code>deprecated</code> are understood by the compiler, others are simply ignored (like <code>priority</code>), but are available to query if you parse the schema at runtime. This is useful if you write your own code generators/editors etc., and you wish to add additional information specific to your tool (such as a help text).</p>
 <p>Current understood attributes:</p>
 <ul>
+<li><code>id: n</code> (on a table field): manually set the field id to <code>n</code>. If you use this attribute, you must use it on ALL fields of this table, and the numbers must be a contiguous range from 0 onwards. Additionally, since a union type effectively adds two fields, its id must be that of the second field (the first field is the type field and not explicitly declared in the schema). Once you've added id's, you can now order fields in any order in the schema, though new fields must still use the next available id when added.</li>
 <li><code>deprecated</code> (on a field): do not generate accessors for this field anymore, code should stop using this data.</li>
 <li><code>original_order</code> (on a table): since elements in a table do not need to be stored in any particular order, they are often optimized for space by sorting them to size. This attribute stops that from happening.</li>
 <li><code>force_align: size</code> (on a struct): force the alignment of this struct to be something higher than what it is naturally aligned to. Causes these structs to be aligned to that amount inside a buffer, IF that buffer is allocated with that alignment (which is not necessarily the case for buffers accessed directly inside a <code>FlatBufferBuilder</code>).</li>
 </ul>
 <h2>Gotchas</h2>
 <h3>Schemas and version control</h3>
-<p>FlatBuffers relies on new field declarations being added at the end, and earlier declarations to not be removed, but be marked deprecated when needed. We think this is an improvement over the manual number assignment that happens in Protocol Buffers.</p>
+<p>FlatBuffers relies on new field declarations being added at the end, and earlier declarations to not be removed, but be marked deprecated when needed. We think this is an improvement over the manual number assignment that happens in Protocol Buffers (and which is still an option using the <code>id</code> attribute mentioned above).</p>
 <p>One place where this is possibly problematic however is source control. If user A adds a field, generates new binary data with this new schema, then tries to commit both to source control after user B already committed a new field also, and just auto-merges the schema, the binary files are now invalid compared to the new schema.</p>
-<p>The solution of course is that you should not be generating binary data before your schema changes have been committed, ensuring consistency with the rest of the world. </p>
+<p>The solution of course is that you should not be generating binary data before your schema changes have been committed, ensuring consistency with the rest of the world. If this is not practical for you, use explicit field ids, which should always generate a merge conflict if two people try to allocate the same id. </p>
 </div></div><!-- contents -->
 </div><!-- doc-content -->
 <!-- Google Analytics -->
diff --git a/docs/source/Schemas.md b/docs/source/Schemas.md
index 6e20289..debab67 100755
--- a/docs/source/Schemas.md
+++ b/docs/source/Schemas.md
@@ -51,6 +51,9 @@ and backwards compatibility. Note that:
     definition. Older data will still
     read correctly, and give you the default value when read. Older code
     will simply ignore the new field.
+    If you want to have flexibility to use any order for fields in your
+    schema, you can manually assign ids (much like Protocol Buffers),
+    see the `id` attribute below.
 
 -   You cannot delete fields you don't use anymore from the schema,
     but you can simply
@@ -164,6 +167,17 @@ help text).
 
 Current understood attributes:
 
+-   `id: n` (on a table field): manually set the field identifier to `n`.
+    If you use this attribute, you must use it on ALL fields of this table,
+    and the numbers must be a contiguous range from 0 onwards.
+    Additionally, since a union type effectively adds two fields, its
+    id must be that of the second field (the first field is the type
+    field and not explicitly declared in the schema).
+    For example, if the last field before the union field had id 6,
+    the union field should have id 8, and the union's type field will
+    implicitly be 7.
+    IDs allow the fields to be placed in any order in the schema.
+    When a new field is added to the schema, it must use the next available ID.
 -   `deprecated` (on a field): do not generate accessors for this field
     anymore, code should stop using this data.
 -   `original_order` (on a table): since elements in a table do not need
@@ -182,7 +196,8 @@ Current understood attributes:
 FlatBuffers relies on new field declarations being added at the end, and earlier
 declarations to not be removed, but be marked deprecated when needed. We think
 this is an improvement over the manual number assignment that happens in
-Protocol Buffers.
+Protocol Buffers (and which is still an option using the `id` attribute
+mentioned above).
 
 One place where this is possibly problematic however is source control. If user
 A adds a field, generates new binary data with this new schema, then tries to
@@ -192,5 +207,7 @@ the new schema.
 
 The solution of course is that you should not be generating binary data before
 your schema changes have been committed, ensuring consistency with the rest of
-the world.
+the world. If this is not practical for you, use explicit field ids, which
+should always generate a merge conflict if two people try to allocate the same
+id.
 
diff --git a/src/flatc.cpp b/src/flatc.cpp
index ddd275b..57d7fed 100755
--- a/src/flatc.cpp
+++ b/src/flatc.cpp
@@ -18,7 +18,8 @@
 #include "flatbuffers/idl.h"
 #include "flatbuffers/util.h"
 
-void Error(const char *err, const char *obj = nullptr, bool usage = false);
+static void Error(const char *err, const char *obj = nullptr,
+                  bool usage = false);
 
 namespace flatbuffers {
 
@@ -72,7 +73,7 @@ const Generator generators[] = {
 
 const char *program_name = NULL;
 
-void Error(const char *err, const char *obj, bool usage) {
+static void Error(const char *err, const char *obj, bool usage) {
   printf("%s: %s\n", program_name, err);
   if (obj) printf(": %s", obj);
   printf("\n");
diff --git a/src/idl_parser.cpp b/src/idl_parser.cpp
index 509052a..e16dc33 100644
--- a/src/idl_parser.cpp
+++ b/src/idl_parser.cpp
@@ -306,10 +306,12 @@ void Parser::ParseField(StructDef &struct_def) {
   if (struct_def.fixed && !IsScalar(type.base_type) && !IsStruct(type))
     Error("structs_ may contain only scalar or struct fields");
 
+  FieldDef *typefield = nullptr;
   if (type.base_type == BASE_TYPE_UNION) {
     // For union fields, add a second auto-generated field to hold the type,
     // with _type appended as the name.
-    AddField(struct_def, name + "_type", type.enum_def->underlying_type);
+    typefield = &AddField(struct_def, name + "_type",
+                          type.enum_def->underlying_type);
   }
 
   auto &field = AddField(struct_def, name, type);
@@ -325,6 +327,19 @@ void Parser::ParseField(StructDef &struct_def) {
   if (field.deprecated && struct_def.fixed)
     Error("can't deprecate fields in a struct");
 
+  if (typefield) {
+    // If this field is a union, and it has a manually assigned id,
+    // the automatically added type field should have an id as well (of N - 1).
+    auto attr = field.attributes.Lookup("id");
+    if (attr) {
+      auto id = atoi(attr->constant.c_str());
+      auto val = new Value();
+      val->type = attr->type;
+      val->constant = NumToString(id - 1);
+      typefield->attributes.Add("id", val);
+    }
+  }
+
   Expect(';');
 }
 
@@ -651,6 +666,35 @@ void Parser::ParseDecl() {
     struct_def.minalign = align;
   }
   struct_def.PadLastField(struct_def.minalign);
+  // Check if this is a table that has manual id assignments
+  auto &fields = struct_def.fields.vec;
+  if (!struct_def.fixed && fields.size()) {
+    int num_id_fields = 0;
+    for (auto it = fields.begin(); it != fields.end(); ++it) {
+      if ((*it)->attributes.Lookup("id")) num_id_fields++;
+    }
+    // If any fields have ids..
+    if (num_id_fields) {
+      // Then all fields must have them.
+      if (num_id_fields != fields.size())
+        Error("either all fields or no fields must have an 'id' attribute");
+      // Simply sort by id, then the fields are the same as if no ids had
+      // been specified.
+      std::sort(fields.begin(), fields.end(),
+        [](const FieldDef *a, const FieldDef *b) -> bool {
+          auto a_id = atoi(a->attributes.Lookup("id")->constant.c_str());
+          auto b_id = atoi(b->attributes.Lookup("id")->constant.c_str());
+          return a_id < b_id;
+      });
+      // Verify we have a contiguous set, and reassign vtable offsets.
+      for (int i = 0; i < static_cast<int>(fields.size()); i++) {
+        if (i != atoi(fields[i]->attributes.Lookup("id")->constant.c_str()))
+          Error("field id\'s must be consecutive from 0, id " +
+                NumToString(i) + " missing or set twice");
+        fields[i]->value.offset = FieldIndexToOffset(static_cast<voffset_t>(i));
+      }
+    }
+  }
   Expect('}');
 }
 
diff --git a/tests/MyGame/Example/Monster.java b/tests/MyGame/Example/Monster.java
index d9c59bd..aa0bf7f 100755
--- a/tests/MyGame/Example/Monster.java
+++ b/tests/MyGame/Example/Monster.java
@@ -17,7 +17,6 @@ public class Monster extends Table {
   public String name() { int o = __offset(10); return o != 0 ? __string(o) : null; }
   public byte inventory(int j) { int o = __offset(14); return o != 0 ? bb.get(__vector(o) + j * 1) : 0; }
   public int inventoryLength() { int o = __offset(14); return o != 0 ? __vector_len(o) : 0; }
-  /// an example documentation comment: this will end up in the generated code multiline too
   public byte color() { int o = __offset(16); return o != 0 ? bb.get(o + bb_pos) : 2; }
   public byte testType() { int o = __offset(18); return o != 0 ? bb.get(o + bb_pos) : 0; }
   public Table test(Table obj) { int o = __offset(20); return o != 0 ? __union(obj, o) : null; }
@@ -26,6 +25,7 @@ public class Monster extends Table {
   public int test4Length() { int o = __offset(22); return o != 0 ? __vector_len(o) : 0; }
   public String testarrayofstring(int j) { int o = __offset(24); return o != 0 ? __string(__vector(o) + j * 4) : null; }
   public int testarrayofstringLength() { int o = __offset(24); return o != 0 ? __vector_len(o) : 0; }
+  /// an example documentation comment: this will end up in the generated code multiline too
   public Monster testarrayoftables(int j) { return testarrayoftables(new Monster(), j); }
   public Monster testarrayoftables(Monster obj, int j) { int o = __offset(26); return o != 0 ? obj.__init(__indirect(__vector(o) + j * 4), bb) : null; }
   public int testarrayoftablesLength() { int o = __offset(26); return o != 0 ? __vector_len(o) : 0; }
diff --git a/tests/monster_test.fbs b/tests/monster_test.fbs
index d3735e8..a9f95ea 100755
--- a/tests/monster_test.fbs
+++ b/tests/monster_test.fbs
@@ -18,19 +18,19 @@ struct Vec3 (force_align: 16) {
 }
 
 table Monster {
-  pos:Vec3;
-  mana:short = 150;
-  hp:short = 100;
-  name:string;
-  friendly:bool = false (deprecated, priority: 1);
-  inventory:[ubyte];
+  pos:Vec3 (id: 0);
+  hp:short = 100 (id: 2);
+  mana:short = 150 (id: 1);
+  name:string (id: 3);
+  color:Color = Blue (id: 6);
+  inventory:[ubyte] (id: 5);
+  friendly:bool = false (deprecated, priority: 1, id: 4);
   /// an example documentation comment: this will end up in the generated code
   /// multiline too
-  color:Color = Blue;
-  test:Any;
-  test4:[Test];
-  testarrayofstring:[string];
-  testarrayoftables:[Monster];
+  testarrayoftables:[Monster] (id: 11);
+  testarrayofstring:[string] (id: 10);
+  test:Any (id: 8);
+  test4:[Test] (id: 9);
 }
 
 root_type Monster;
diff --git a/tests/monster_test_generated.h b/tests/monster_test_generated.h
index 7601dc9..627cb1f 100755
--- a/tests/monster_test_generated.h
+++ b/tests/monster_test_generated.h
@@ -85,12 +85,12 @@ struct Monster : private flatbuffers::Table {
   int16_t hp() const { return GetField<int16_t>(8, 100); }
   const flatbuffers::String *name() const { return GetPointer<const flatbuffers::String *>(10); }
   const flatbuffers::Vector<uint8_t> *inventory() const { return GetPointer<const flatbuffers::Vector<uint8_t> *>(14); }
-  /// an example documentation comment: this will end up in the generated code multiline too
   int8_t color() const { return GetField<int8_t>(16, 2); }
   uint8_t test_type() const { return GetField<uint8_t>(18, 0); }
   const void *test() const { return GetPointer<const void *>(20); }
   const flatbuffers::Vector<const Test *> *test4() const { return GetPointer<const flatbuffers::Vector<const Test *> *>(22); }
   const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> *testarrayofstring() const { return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>> *>(24); }
+  /// an example documentation comment: this will end up in the generated code multiline too
   const flatbuffers::Vector<flatbuffers::Offset<Monster>> *testarrayoftables() const { return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Monster>> *>(26); }
   bool Verify(const flatbuffers::Verifier &verifier) const {
     return VerifyTable(verifier) &&
-- 
2.7.4