libceph: support for balanced and localized reads

author Ilya Dryomov <idryomov@gmail.com>

Sat, 23 May 2020 09:45:48 +0000 (11:45 +0200)

committer Ilya Dryomov <idryomov@gmail.com>

Mon, 1 Jun 2020 11:22:53 +0000 (13:22 +0200)
author Ilya Dryomov <idryomov@gmail.com>
Sat, 23 May 2020 09:45:48 +0000 (11:45 +0200)
committer Ilya Dryomov <idryomov@gmail.com>
Mon, 1 Jun 2020 11:22:53 +0000 (13:22 +0200)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h

index 734f7c6..671fb93 100644 (file)
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -165,6 +165,7 @@ struct ceph_osd_request_target {
         bool recovery_deletes;
  
         unsigned int flags;                /* CEPH_OSD_FLAG_* */
+       bool used_replica;
         bool paused;
  
         u32 epoch;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h

index 8c9d18c..3f4498f 100644 (file)
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -317,6 +317,9 @@ int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
  int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
  void ceph_clear_crush_locs(struct rb_root *locs);
  
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+                           struct rb_root *locs);
+
  extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
                                                     u64 id);
  extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c

index 1344f23..409d505 100644 (file)
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -81,11 +81,13 @@ static int osdmap_show(struct seq_file *s, void *p)
                 u32 state = map->osd_state[i];
                 char sb[64];
  
-               seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
+               seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n",
                            i, ceph_pr_addr(addr),
                            ((map->osd_weight[i]*100) >> 16),
                            ceph_osdmap_state_str(sb, sizeof(sb), state),
-                          ((ceph_get_primary_affinity(map, i)*100) >> 16));
+                          ((ceph_get_primary_affinity(map, i)*100) >> 16),
+                          ceph_get_crush_locality(map, i,
+                                          &client->options->crush_locs));
         }
         for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
                 struct ceph_pg_mapping *pg =
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c

index ece124a..4ce6cdc 100644 (file)
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1497,6 +1497,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
                (osdc->osdmap->epoch < osdc->epoch_barrier);
  }
  
+static int pick_random_replica(const struct ceph_osds *acting)
+{
+       int i = prandom_u32() % acting->size;
+
+       dout("%s picked osd%d, primary osd%d\n", __func__,
+            acting->osds[i], acting->primary);
+       return i;
+}
+
+/*
+ * Picks the closest replica based on client's location given by
+ * crush_location option.  Prefers the primary if the locality is
+ * the same.
+ */
+static int pick_closest_replica(struct ceph_osd_client *osdc,
+                               const struct ceph_osds *acting)
+{
+       struct ceph_options *opt = osdc->client->options;
+       int best_i, best_locality;
+       int i = 0, locality;
+
+       do {
+               locality = ceph_get_crush_locality(osdc->osdmap,
+                                                  acting->osds[i],
+                                                  &opt->crush_locs);
+               if (i == 0 ||
+                   (locality >= 0 && best_locality < 0) ||
+                   (locality >= 0 && best_locality >= 0 &&
+                    locality < best_locality)) {
+                       best_i = i;
+                       best_locality = locality;
+               }
+       } while (++i < acting->size);
+
+       dout("%s picked osd%d with locality %d, primary osd%d\n", __func__,
+            acting->osds[best_i], best_locality, acting->primary);
+       return best_i;
+}
+
  enum calc_target_result {
         CALC_TARGET_NO_ACTION = 0,
         CALC_TARGET_NEED_RESEND,
@@ -1510,6 +1549,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
         struct ceph_pg_pool_info *pi;
         struct ceph_pg pgid, last_pgid;
         struct ceph_osds up, acting;
+       bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+       bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
         bool force_resend = false;
         bool unpaused = false;
         bool legacy_change = false;
@@ -1540,9 +1581,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
         ceph_oid_copy(&t->target_oid, &t->base_oid);
         ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
         if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
-               if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+               if (is_read && pi->read_tier >= 0)
                         t->target_oloc.pool = pi->read_tier;
-               if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+               if (is_write && pi->write_tier >= 0)
                         t->target_oloc.pool = pi->write_tier;
  
                 pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
@@ -1581,7 +1622,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
                 unpaused = true;
         }
         legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
-                       ceph_osds_changed(&t->acting, &acting, any_change);
+                       ceph_osds_changed(&t->acting, &acting,
+                                         t->used_replica || any_change);
         if (t->pg_num)
                 split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
  
@@ -1597,7 +1639,24 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
                 t->sort_bitwise = sort_bitwise;
                 t->recovery_deletes = recovery_deletes;
  
-               t->osd = acting.primary;
+               if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+                                CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+                   !is_write && pi->type == CEPH_POOL_TYPE_REP &&
+                   acting.size > 1) {
+                       int pos;
+
+                       WARN_ON(!is_read || acting.osds[0] != acting.primary);
+                       if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
+                               pos = pick_random_replica(&acting);
+                       } else {
+                               pos = pick_closest_replica(osdc, &acting);
+                       }
+                       t->osd = acting.osds[pos];
+                       t->used_replica = pos > 0;
+               } else {
+                       t->osd = acting.primary;
+                       t->used_replica = false;
+               }
         }
  
         if (unpaused || legacy_change || force_resend || split)
@@ -3660,6 +3719,26 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
                 goto out_unlock_osdc;
         }
  
+       if (m.result == -EAGAIN) {
+               dout("req %p tid %llu EAGAIN\n", req, req->r_tid);
+               unlink_request(osd, req);
+               mutex_unlock(&osd->lock);
+
+               /*
+                * The object is missing on the replica or not (yet)
+                * readable.  Clear pgid to force a resend to the primary
+                * via legacy_change.
+                */
+               req->r_t.pgid.pool = 0;
+               req->r_t.pgid.seed = 0;
+               WARN_ON(!req->r_t.used_replica);
+               req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+                                 CEPH_OSD_FLAG_LOCALIZE_READS);
+               req->r_tid = 0;
+               __submit_request(req, false);
+               goto out_unlock_osdc;
+       }
+
         if (m.num_ops != req->r_num_ops) {
                 pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
                        req->r_num_ops, req->r_tid);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c

index 4b81334..96c25f5 100644 (file)
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2831,3 +2831,105 @@ void ceph_clear_crush_locs(struct rb_root *locs)
                 free_crush_loc(loc);
         }
  }
+
+/*
+ * [a-zA-Z0-9-_.]+
+ */
+static bool is_valid_crush_name(const char *name)
+{
+       do {
+               if (!('a' <= *name && *name <= 'z') &&
+                   !('A' <= *name && *name <= 'Z') &&
+                   !('0' <= *name && *name <= '9') &&
+                   *name != '-' && *name != '_' && *name != '.')
+                       return false;
+       } while (*++name != '\0');
+
+       return true;
+}
+
+/*
+ * Gets the parent of an item.  Returns its id (<0 because the
+ * parent is always a bucket), type id (>0 for the same reason,
+ * via @parent_type_id) and location (via @parent_loc).  If no
+ * parent, returns 0.
+ *
+ * Does a linear search, as there are no parent pointers of any
+ * kind.  Note that the result is ambigous for items that occur
+ * multiple times in the map.
+ */
+static int get_immediate_parent(struct crush_map *c, int id,
+                               u16 *parent_type_id,
+                               struct crush_loc *parent_loc)
+{
+       struct crush_bucket *b;
+       struct crush_name_node *type_cn, *cn;
+       int i, j;
+
+       for (i = 0; i < c->max_buckets; i++) {
+               b = c->buckets[i];
+               if (!b)
+                       continue;
+
+               /* ignore per-class shadow hierarchy */
+               cn = lookup_crush_name(&c->names, b->id);
+               if (!cn || !is_valid_crush_name(cn->cn_name))
+                       continue;
+
+               for (j = 0; j < b->size; j++) {
+                       if (b->items[j] != id)
+                               continue;
+
+                       *parent_type_id = b->type;
+                       type_cn = lookup_crush_name(&c->type_names, b->type);
+                       parent_loc->cl_type_name = type_cn->cn_name;
+                       parent_loc->cl_name = cn->cn_name;
+                       return b->id;
+               }
+       }
+
+       return 0;  /* no parent */
+}
+
+/*
+ * Calculates the locality/distance from an item to a client
+ * location expressed in terms of CRUSH hierarchy as a set of
+ * (bucket type name, bucket name) pairs.  Specifically, looks
+ * for the lowest-valued bucket type for which the location of
+ * @id matches one of the locations in @locs, so for standard
+ * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
+ * a matching host is closer than a matching rack and a matching
+ * data center is closer than a matching zone.
+ *
+ * Specifying multiple locations (a "multipath" location) such
+ * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs
+ * is a multimap.  The locality will be:
+ *
+ * - 3 for OSDs in racks foo1 and foo2
+ * - 8 for OSDs in data center bar
+ * - -1 for all other OSDs
+ *
+ * The lowest possible bucket type is 1, so the best locality
+ * for an OSD is 1 (i.e. a matching host).  Locality 0 would be
+ * the OSD itself.
+ */
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+                           struct rb_root *locs)
+{
+       struct crush_loc loc;
+       u16 type_id;
+
+       /*
+        * Instead of repeated get_immediate_parent() calls,
+        * the location of @id could be obtained with a single
+        * depth-first traversal.
+        */
+       for (;;) {
+               id = get_immediate_parent(osdmap->crush, id, &type_id, &loc);
+               if (id >= 0)
+                       return -1;  /* not local */
+
+               if (lookup_crush_loc(locs, &loc))
+                       return type_id;
+       }
+}
author	Ilya Dryomov <idryomov@gmail.com>
	Sat, 23 May 2020 09:45:48 +0000 (11:45 +0200)
committer	Ilya Dryomov <idryomov@gmail.com>
	Mon, 1 Jun 2020 11:22:53 +0000 (13:22 +0200)
include/linux/ceph/osd_client.h		patch \| blob \| history
include/linux/ceph/osdmap.h		patch \| blob \| history
net/ceph/debugfs.c		patch \| blob \| history
net/ceph/osd_client.c		patch \| blob \| history
net/ceph/osdmap.c		patch \| blob \| history