libceph: pg_upmap[_items] infrastructure
author Ilya Dryomov <idryomov@gmail.com>
Wed, 21 Jun 2017 15:27:18 +0000 (17:27 +0200)
committer Ilya Dryomov <idryomov@gmail.com>
Fri, 7 Jul 2017 15:25:18 +0000 (17:25 +0200)
pg_temp and pg_upmap encodings are the same (PG -> array of osds),
except for the incremental remove: it's an empty mapping in new_pg_temp
for pg_temp and a separate old_pg_upmap set for pg_upmap.  (This isn't
to allow for empty pg_upmap mappings -- apparently, pg_temp just wasn't
looked at as an example for pg_upmap encoding.)

Reuse __decode_pg_temp() for decoding pg_upmap and new_pg_upmap.
__decode_pg_temp() stores into pg_temp union member, but since pg_upmap
union member is identical, reading through pg_upmap later is OK.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
include/linux/ceph/osdmap.h
net/ceph/debugfs.c
net/ceph/osdmap.c

index fe6d189..c612cff 100644 (file)
@@ -143,10 +143,14 @@ struct ceph_pg_mapping {
                struct {
                        int len;
                        int osds[];
-               } pg_temp;
+               } pg_temp, pg_upmap;
                struct {
                        int osd;
                } primary_temp;
+               struct {
+                       int len;
+                       int from_to[][2];
+               } pg_upmap_items;
        };
 };
 
@@ -165,6 +169,10 @@ struct ceph_osdmap {
        struct rb_root pg_temp;
        struct rb_root primary_temp;
 
+       /* remap (post-CRUSH, pre-up) */
+       struct rb_root pg_upmap;        /* PG := raw set */
+       struct rb_root pg_upmap_items;  /* from -> to within raw set */
+
        u32 *osd_primary_affinity;
 
        struct rb_root pg_pools;
index 017f15c..4f57d5b 100644 (file)
@@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p)
                seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
                           pg->pgid.seed, pg->primary_temp.osd);
        }
+       for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(n, struct ceph_pg_mapping, node);
+
+               seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool,
+                          pg->pgid.seed);
+               for (i = 0; i < pg->pg_upmap.len; i++)
+                       seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+                                  pg->pg_upmap.osds[i]);
+               seq_printf(s, "]\n");
+       }
+       for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(n, struct ceph_pg_mapping, node);
+
+               seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool,
+                          pg->pgid.seed);
+               for (i = 0; i < pg->pg_upmap_items.len; i++)
+                       seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","),
+                                  pg->pg_upmap_items.from_to[i][0],
+                                  pg->pg_upmap_items.from_to[i][1]);
+               seq_printf(s, "]\n");
+       }
 
        up_read(&osdc->lock);
        return 0;
index f6d561e..a3f60d0 100644 (file)
@@ -735,6 +735,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
        map->pool_max = -1;
        map->pg_temp = RB_ROOT;
        map->primary_temp = RB_ROOT;
+       map->pg_upmap = RB_ROOT;
+       map->pg_upmap_items = RB_ROOT;
        mutex_init(&map->crush_workspace_mutex);
 
        return map;
@@ -759,6 +761,20 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
                erase_pg_mapping(&map->primary_temp, pg);
                free_pg_mapping(pg);
        }
+       while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(rb_first(&map->pg_upmap),
+                                struct ceph_pg_mapping, node);
+               rb_erase(&pg->node, &map->pg_upmap);
+               kfree(pg);
+       }
+       while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(rb_first(&map->pg_upmap_items),
+                                struct ceph_pg_mapping, node);
+               rb_erase(&pg->node, &map->pg_upmap_items);
+               kfree(pg);
+       }
        while (!RB_EMPTY_ROOT(&map->pg_pools)) {
                struct ceph_pg_pool_info *pi =
                        rb_entry(rb_first(&map->pg_pools),
@@ -1161,6 +1177,75 @@ e_inval:
        return -EINVAL;
 }
 
+/*
+ * Decode a single pg_upmap entry by delegating to __decode_pg_temp().
+ * The bool parameter is ignored; false is always passed down in its
+ * place.  pg_upmap is declared with the same layout as pg_temp in
+ * struct ceph_pg_mapping, so a mapping filled in by __decode_pg_temp()
+ * can be read back through the pg_upmap member.
+ */
+static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
+                                                bool __unused)
+{
+       return __decode_pg_temp(p, end, false);
+}
+
+/*
+ * Decode the pg_upmap section of a full map into map->pg_upmap, using
+ * __decode_pg_upmap() as the per-entry decoder.  Called from
+ * osdmap_decode() when struct_v >= 4.  Returns the result of
+ * decode_pg_mapping(); the caller treats non-zero as an error.
+ *
+ * NOTE(review): the trailing bool is presumably decode_pg_mapping()'s
+ * "incremental" flag (false here) -- confirm against its definition.
+ */
+static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+       return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+                                false);
+}
+
+/*
+ * Decode the new_pg_upmap section into map->pg_upmap, using
+ * __decode_pg_upmap() as the per-entry decoder.  Called from
+ * osdmap_apply_incremental() when struct_v >= 4.  Returns the result
+ * of decode_pg_mapping(); the caller treats non-zero as an error.
+ *
+ * NOTE(review): the trailing bool is presumably decode_pg_mapping()'s
+ * "incremental" flag (true here) -- confirm against its definition.
+ */
+static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+       return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
+                                true);
+}
+
+/*
+ * Process the old_pg_upmap section against map->pg_upmap.  Called from
+ * osdmap_apply_incremental() when struct_v >= 4, right after
+ * decode_new_pg_upmap().  Returns the result of decode_pg_mapping().
+ *
+ * NOTE(review): no per-entry decoder is supplied (NULL); presumably
+ * decode_pg_mapping() then treats each listed PG as a removal, which
+ * would match old_pg_upmap being the incremental "remove" set --
+ * confirm against decode_pg_mapping().
+ */
+static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
+{
+       return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
+}
+
+/*
+ * Decode a single pg_upmap_items entry: a u32 pair count followed by
+ * that many (from, to) pairs of u32 values.
+ *
+ * Returns a newly allocated mapping with pg_upmap_items filled in.
+ * Returns ERR_PTR(-EINVAL) if the pair count would overflow the
+ * allocation size or if ceph_decode_32_safe()/ceph_decode_need() bail
+ * out to e_inval, and ERR_PTR(-ENOMEM) if the allocation fails.
+ * The bool parameter is unused.
+ */
+static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
+                                                      bool __unused)
+{
+       struct ceph_pg_mapping *pg;
+       u32 len, i;
+
+       ceph_decode_32_safe(p, end, len, e_inval);
+       /* keep sizeof(*pg) + 2 * len * sizeof(u32) from wrapping */
+       if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
+               return ERR_PTR(-EINVAL);
+
+       ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval);
+       pg = kzalloc(sizeof(*pg) + 2 * len * sizeof(u32), GFP_NOIO);
+       if (!pg)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * A zeroed rb_node is not "empty" as far as RB_EMPTY_NODE() is
+        * concerned.  Mark it cleared so that rbtree insert/free
+        * helpers which check RB_EMPTY_NODE() see a consistent state.
+        */
+       RB_CLEAR_NODE(&pg->node);
+
+       pg->pg_upmap_items.len = len;
+       for (i = 0; i < len; i++) {
+               pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
+               pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
+       }
+
+       return pg;
+
+e_inval:
+       return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Decode the pg_upmap_items section of a full map into
+ * map->pg_upmap_items, using __decode_pg_upmap_items() as the per-entry
+ * decoder.  Called from osdmap_decode() when struct_v >= 4.  Returns
+ * the result of decode_pg_mapping(); the caller treats non-zero as an
+ * error.
+ *
+ * NOTE(review): the trailing bool is presumably decode_pg_mapping()'s
+ * "incremental" flag (false here) -- confirm against its definition.
+ */
+static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
+{
+       return decode_pg_mapping(p, end, &map->pg_upmap_items,
+                                __decode_pg_upmap_items, false);
+}
+
+/*
+ * Decode the new_pg_upmap_items section into map->pg_upmap_items, using
+ * __decode_pg_upmap_items() as the per-entry decoder.  Called from
+ * osdmap_apply_incremental() when struct_v >= 4.  Returns the result
+ * of decode_pg_mapping(); the caller treats non-zero as an error.
+ *
+ * NOTE(review): the trailing bool is presumably decode_pg_mapping()'s
+ * "incremental" flag (true here) -- confirm against its definition.
+ */
+static int decode_new_pg_upmap_items(void **p, void *end,
+                                    struct ceph_osdmap *map)
+{
+       return decode_pg_mapping(p, end, &map->pg_upmap_items,
+                                __decode_pg_upmap_items, true);
+}
+
+/*
+ * Process the old_pg_upmap_items section against map->pg_upmap_items.
+ * Called from osdmap_apply_incremental() when struct_v >= 4, right
+ * after decode_new_pg_upmap_items().  Returns the result of
+ * decode_pg_mapping().
+ *
+ * NOTE(review): no per-entry decoder is supplied (NULL); presumably
+ * decode_pg_mapping() then treats each listed PG as a removal --
+ * confirm against decode_pg_mapping().
+ */
+static int decode_old_pg_upmap_items(void **p, void *end,
+                                    struct ceph_osdmap *map)
+{
+       return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
+}
+
 /*
  * decode a full map.
  */
@@ -1250,9 +1335,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
                if (err)
                        goto bad;
        } else {
-               /* XXX can this happen? */
-               kfree(map->osd_primary_affinity);
-               map->osd_primary_affinity = NULL;
+               WARN_ON(map->osd_primary_affinity);
        }
 
        /* crush */
@@ -1261,6 +1344,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
        if (err)
                goto bad;
 
+       *p += len;
+       if (struct_v >= 3) {
+               /* erasure_code_profiles */
+               ceph_decode_skip_map_of_map(p, end, string, string, string,
+                                           bad);
+       }
+
+       if (struct_v >= 4) {
+               err = decode_pg_upmap(p, end, map);
+               if (err)
+                       goto bad;
+
+               err = decode_pg_upmap_items(p, end, map);
+               if (err)
+                       goto bad;
+       } else {
+               WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
+               WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
+       }
+
        /* ignore the rest */
        *p = end;
 
@@ -1520,6 +1623,32 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                        goto bad;
        }
 
+       if (struct_v >= 3) {
+               /* new_erasure_code_profiles */
+               ceph_decode_skip_map_of_map(p, end, string, string, string,
+                                           bad);
+               /* old_erasure_code_profiles */
+               ceph_decode_skip_set(p, end, string, bad);
+       }
+
+       if (struct_v >= 4) {
+               err = decode_new_pg_upmap(p, end, map);
+               if (err)
+                       goto bad;
+
+               err = decode_old_pg_upmap(p, end, map);
+               if (err)
+                       goto bad;
+
+               err = decode_new_pg_upmap_items(p, end, map);
+               if (err)
+                       goto bad;
+
+               err = decode_old_pg_upmap_items(p, end, map);
+               if (err)
+                       goto bad;
+       }
+
        /* ignore the rest */
        *p = end;