libceph: introduce ceph_osd_request_target, calc_target()

author Ilya Dryomov <idryomov@gmail.com>

Thu, 28 Apr 2016 14:07:23 +0000 (16:07 +0200)

committer Ilya Dryomov <idryomov@gmail.com>

Wed, 25 May 2016 22:36:26 +0000 (00:36 +0200)
author Ilya Dryomov <idryomov@gmail.com>
Thu, 28 Apr 2016 14:07:23 +0000 (16:07 +0200)
committer Ilya Dryomov <idryomov@gmail.com>
Wed, 25 May 2016 22:36:26 +0000 (00:36 +0200)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c

index 6f28dd9..c5d7548 100644 (file)
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1774,7 +1774,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
         wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
                           CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
         osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-       wr_req->r_base_oloc.pool = pool;
+       ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
         ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
  
         err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c

index 9d47039..36b4a41 100644 (file)
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -714,7 +714,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
         req->r_flags =  CEPH_OSD_FLAG_ORDERSNAP |
                         CEPH_OSD_FLAG_ONDISK |
                         CEPH_OSD_FLAG_WRITE;
-       req->r_base_oloc = orig_req->r_base_oloc;
+       ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
         ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
  
         ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h

index 63854a8..48806ee 100644 (file)
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -24,6 +24,8 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
                                      struct ceph_msg *);
  typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
  
+#define CEPH_HOMELESS_OSD      -1      /* value of a target's osd field while no osd is assigned */
+
  /* a given osd we're communicating with */
  struct ceph_osd {
         atomic_t o_ref;
@@ -118,6 +120,27 @@ struct ceph_osd_req_op {
         };
  };
  
+struct ceph_osd_request_target {
+       struct ceph_object_id base_oid;         /* calc_target() copies this into target_oid */
+       struct ceph_object_locator base_oloc;   /* its pool is the one calc_target() looks up first */
+       struct ceph_object_id target_oid;       /* oid that is actually mapped to a PG */
+       struct ceph_object_locator target_oloc; /* base_oloc, or a tier pool picked in calc_target() */
+
+       struct ceph_pg pgid;                    /* PG computed from target_oid + target_oloc */
+       u32 pg_num;                             /* pool's pg_num as of the last refresh in calc_target() */
+       u32 pg_num_mask;                        /* pool's pg_num_mask, refreshed together with pg_num */
+       struct ceph_osds acting;                /* acting set as of the last refresh */
+       struct ceph_osds up;                    /* up set as of the last refresh */
+       int size;                               /* pool's size; -1 after target_init() */
+       int min_size;                           /* pool's min_size; -1 after target_init() */
+       bool sort_bitwise;                      /* CEPH_OSDMAP_SORTBITWISE state as of the last refresh */
+
+       unsigned int flags;                /* CEPH_OSD_FLAG_* */
+       bool paused;                            /* cleared by calc_target() once pausing no longer applies */
+
+       int osd;                                /* acting primary, or CEPH_HOMELESS_OSD */
+};
+
  /* an in-flight request */
  struct ceph_osd_request {
         u64             r_tid;              /* unique for this client */
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h

index 989294d..420bb79 100644 (file)
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -28,6 +28,7 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
  
  #define CEPH_POOL_FLAG_HASHPSPOOL      (1ULL << 0) /* hash pg seed and pool id
                                                        together */
+#define CEPH_POOL_FLAG_FULL            (1ULL << 1) /* pool is full */
  
  struct ceph_pg_pool_info {
         struct rb_node node;
@@ -62,6 +63,22 @@ struct ceph_object_locator {
         s64 pool;
  };
  
+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
+{
+       oloc->pool = -1;        /* -1 is the "empty" marker tested by ceph_oloc_empty() */
+}
+
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
+{
+       return oloc->pool == -1;        /* same sentinel that ceph_oloc_init() stores */
+}
+
+static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
+                                 const struct ceph_object_locator *src)
+{
+       dest->pool = src->pool;         /* plain value copy of the pool id */
+}
+
  /*
   * Maximum supported by kernel client object name length
   *
@@ -227,6 +244,23 @@ static inline void ceph_osds_init(struct ceph_osds *set)
  
  void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
  
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+                         const struct ceph_osds *new_acting,
+                         const struct ceph_osds *old_up,
+                         const struct ceph_osds *new_up,
+                         int old_size,
+                         int new_size,
+                         int old_min_size,
+                         int new_min_size,
+                         u32 old_pg_num,
+                         u32 new_pg_num,
+                         bool old_sort_bitwise,
+                         bool new_sort_bitwise,
+                         const struct ceph_pg *pgid);
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+                      const struct ceph_osds *new_acting,
+                      bool any_change);
+
  /* calculate mapping of a file extent to an object */
  extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
                                          u64 off, u64 len,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h

index 913c87c..f28ed86 100644 (file)
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
  #define CEPH_OSDMAP_NOIN     (1<<8)  /* block osd auto mark-in */
  #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
  #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB  (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
  
  /*
   * The error code to return when an OSD can't handle a write
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c

index 0ff400a..cff3a7e 100644 (file)
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -299,6 +299,30 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
  }
  
  /*
+ * Assumes @t is zero-initialized.
+ */
+static void target_init(struct ceph_osd_request_target *t)
+{
+       ceph_oid_init(&t->base_oid);
+       ceph_oloc_init(&t->base_oloc);          /* pool = -1, i.e. empty */
+       ceph_oid_init(&t->target_oid);
+       ceph_oloc_init(&t->target_oloc);        /* empty, so calc_target() derives it from base_oloc */
+
+       ceph_osds_init(&t->acting);
+       ceph_osds_init(&t->up);
+       t->size = -1;           /* replaced by the pool's size in calc_target() */
+       t->min_size = -1;       /* replaced by the pool's min_size in calc_target() */
+
+       t->osd = CEPH_HOMELESS_OSD;     /* no osd assigned yet */
+}
+
+static void target_destroy(struct ceph_osd_request_target *t)
+{
+       ceph_oid_destroy(&t->base_oid);         /* counterpart of ceph_oid_init() in target_init() */
+       ceph_oid_destroy(&t->target_oid);       /* likewise for the target oid */
+}
+
+/*
   * requests
   */
  static void ceph_osdc_release_request(struct kref *kref)
@@ -1273,6 +1297,11 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
  }
  EXPORT_SYMBOL(ceph_osdc_set_request_linger);
  
+static bool __pool_full(struct ceph_pg_pool_info *pi)
+{
+       return pi->flags & CEPH_POOL_FLAG_FULL; /* masked bit becomes true/false via the bool return type */
+}
+
  /*
   * Returns whether a request should be blocked from being sent
   * based on the current osdmap and osd_client settings.
@@ -1289,6 +1318,20 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc,
                 (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
  }
  
+static bool target_should_be_paused(struct ceph_osd_client *osdc,
+                                   const struct ceph_osd_request_target *t,
+                                   struct ceph_pg_pool_info *pi)
+{
+       bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+       bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+                      ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+                      __pool_full(pi); /* writes pause on PAUSEWR, the osdmap FULL flag, or a full pool */
+
+       WARN_ON(pi->id != t->base_oloc.pool); /* @pi is expected to be the info for the base pool */
+       return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || /* read while reads are paused */
+              (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); /* write while writes are paused */
+}
+
  /*
   * Calculate mapping of a request to a PG.  Takes tiering into account.
   */
@@ -1328,6 +1371,116 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap,
                                          &req->r_target_oloc, pg_out);
  }
  
+enum calc_target_result {
+       CALC_TARGET_NO_ACTION = 0,      /* nothing in calc_target() asked for a resend */
+       CALC_TARGET_NEED_RESEND,        /* mapping refreshed, resend forced, or target unpaused */
+       CALC_TARGET_POOL_DNE,           /* pool lookup failed; t->osd is CEPH_HOMELESS_OSD */
+};
+
+static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
+                                          struct ceph_osd_request_target *t,
+                                          u32 *last_force_resend,
+                                          bool any_change)
+{
+       struct ceph_pg_pool_info *pi;
+       struct ceph_pg pgid, last_pgid;
+       struct ceph_osds up, acting;
+       bool force_resend = false;
+       bool need_check_tiering = false;
+       bool need_resend = false;
+       bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
+                                            CEPH_OSDMAP_SORTBITWISE);
+       enum calc_target_result ct_res;
+       int ret;
+
+       pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
+       if (!pi) { /* base pool is not in the current osdmap */
+               t->osd = CEPH_HOMELESS_OSD;
+               ct_res = CALC_TARGET_POOL_DNE;
+               goto out;
+       }
+
+       if (osdc->osdmap->epoch == pi->last_force_request_resend) {
+               if (last_force_resend && /* caller supplied a tracker */
+                   *last_force_resend < pi->last_force_request_resend) {
+                       *last_force_resend = pi->last_force_request_resend;
+                       force_resend = true; /* tracker was just advanced, so this value forces only once */
+               } else if (!last_force_resend) { /* no tracker: force every time the epochs match */
+                       force_resend = true;
+               }
+       }
+       if (ceph_oid_empty(&t->target_oid) || force_resend) { /* (re)derive target oid from base */
+               ceph_oid_copy(&t->target_oid, &t->base_oid);
+               need_check_tiering = true;
+       }
+       if (ceph_oloc_empty(&t->target_oloc) || force_resend) { /* (re)derive target oloc from base */
+               ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
+               need_check_tiering = true;
+       }
+
+       if (need_check_tiering && /* only when the target was just reset above */
+           (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+               if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+                       t->target_oloc.pool = pi->read_tier;
+               if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+                       t->target_oloc.pool = pi->write_tier; /* assigned last: wins if READ and WRITE are both set */
+       }
+
+       ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
+                                       &t->target_oloc, &pgid);
+       if (ret) {
+               WARN_ON(ret != -ENOENT); /* any other error is unexpected here */
+               t->osd = CEPH_HOMELESS_OSD;
+               ct_res = CALC_TARGET_POOL_DNE;
+               goto out;
+       }
+       last_pgid.pool = pgid.pool;
+       last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); /* seed folded with the cached pg_num/mask */
+
+       ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
+       if (any_change && /* interval comparison is skipped unless any_change is set */
+           ceph_is_new_interval(&t->acting,
+                                &acting,
+                                &t->up,
+                                &up,
+                                t->size,
+                                pi->size,
+                                t->min_size,
+                                pi->min_size,
+                                t->pg_num,
+                                pi->pg_num,
+                                t->sort_bitwise,
+                                sort_bitwise,
+                                &last_pgid))
+               force_resend = true;
+
+       if (t->paused && !target_should_be_paused(osdc, t, pi)) { /* pause condition no longer holds */
+               t->paused = false;
+               need_resend = true;
+       }
+
+       if (ceph_pg_compare(&t->pgid, &pgid) ||
+           ceph_osds_changed(&t->acting, &acting, any_change) ||
+           force_resend) { /* refresh everything cached in @t from the current map */
+               t->pgid = pgid; /* struct */
+               ceph_osds_copy(&t->acting, &acting);
+               ceph_osds_copy(&t->up, &up);
+               t->size = pi->size;
+               t->min_size = pi->min_size;
+               t->pg_num = pi->pg_num;
+               t->pg_num_mask = pi->pg_num_mask;
+               t->sort_bitwise = sort_bitwise;
+
+               t->osd = acting.primary; /* target osd is the acting primary */
+               need_resend = true;
+       }
+
+       ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
+out:
+       dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+       return ct_res;
+}
+
  static void __enqueue_request(struct ceph_osd_request *req)
  {
         struct ceph_osd_client *osdc = req->r_osdc;
@@ -1805,12 +1958,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
                 redir.oloc.pool = -1;
         }
  
-       if (redir.oloc.pool != -1) {
+       if (!ceph_oloc_empty(&redir.oloc)) {
                 dout("redirect pool %lld\n", redir.oloc.pool);
  
                 __unregister_request(osdc, req);
  
-               req->r_target_oloc = redir.oloc; /* struct */
+               ceph_oloc_copy(&req->r_target_oloc, &redir.oloc);
  
                 /*
                  * Start redirect requests with nofail=true.  If
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c

index 66c3ebe..7d4a5b4 100644 (file)
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1521,6 +1521,32 @@ void ceph_oid_destroy(struct ceph_object_id *oid)
  }
  EXPORT_SYMBOL(ceph_oid_destroy);
  
+/*
+ * osds only
+ */
+static bool __osds_equal(const struct ceph_osds *lhs,
+                        const struct ceph_osds *rhs)
+{
+       if (lhs->size == rhs->size && /* sizes match, so rhs->size bounds both arrays */
+           !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
+               return true;
+
+       return false; /* primary is not looked at here; osds_equal() adds that check */
+}
+
+/*
+ * osds + primary
+ */
+static bool osds_equal(const struct ceph_osds *lhs,
+                      const struct ceph_osds *rhs)
+{
+       if (__osds_equal(lhs, rhs) && /* same size and same osd array contents */
+           lhs->primary == rhs->primary) /* and the same primary */
+               return true;
+
+       return false;
+}
+
  static bool osds_valid(const struct ceph_osds *set)
  {
         /* non-empty set */
@@ -1553,6 +1579,101 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
         dest->primary = src->primary;
  }
  
+static bool is_split(const struct ceph_pg *pgid,
+                    u32 old_pg_num,
+                    u32 new_pg_num)
+{
+       int old_bits = calc_bits_of(old_pg_num); /* presumably the bit width of old_pg_num; helper not shown here */
+       int old_mask = (1 << old_bits) - 1;
+       int n;
+
+       WARN_ON(pgid->seed >= old_pg_num); /* @pgid is expected to be valid under the old pg_num */
+       if (new_pg_num <= old_pg_num)
+               return false; /* pg_num did not grow, so nothing split */
+
+       for (n = 1; ; n++) { /* walk candidate seeds: pgid->seed with extra high bits set */
+               int next_bit = n << (old_bits - 1);
+               u32 s = next_bit | pgid->seed;
+
+               if (s < old_pg_num || s == pgid->seed)
+                       continue; /* candidate already existed under the old pg_num */
+               if (s >= new_pg_num)
+                       break; /* candidate lies beyond the new pg_num: stop searching */
+
+               s = ceph_stable_mod(s, old_pg_num, old_mask);
+               if (s == pgid->seed)
+                       return true; /* a seed in [old_pg_num, new_pg_num) folded onto this pg before */
+       }
+
+       return false;
+}
+
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+                         const struct ceph_osds *new_acting,
+                         const struct ceph_osds *old_up,
+                         const struct ceph_osds *new_up,
+                         int old_size,
+                         int new_size,
+                         int old_min_size,
+                         int new_min_size,
+                         u32 old_pg_num,
+                         u32 new_pg_num,
+                         bool old_sort_bitwise,
+                         bool new_sort_bitwise,
+                         const struct ceph_pg *pgid)
+{
+       return !osds_equal(old_acting, new_acting) || /* acting osds or acting primary differ */
+              !osds_equal(old_up, new_up) || /* up osds or up primary differ */
+              old_size != new_size ||
+              old_min_size != new_min_size ||
+              is_split(pgid, old_pg_num, new_pg_num) || /* pg_num grew in a way that splits @pgid */
+              old_sort_bitwise != new_sort_bitwise;
+}
+
+static int calc_pg_rank(int osd, const struct ceph_osds *acting)
+{
+       int i;
+
+       for (i = 0; i < acting->size; i++) {
+               if (acting->osds[i] == osd)
+                       return i; /* index of the first entry equal to @osd */
+       }
+
+       return -1; /* @osd does not appear in the set */
+}
+
+static bool primary_changed(const struct ceph_osds *old_acting,
+                           const struct ceph_osds *new_acting)
+{
+       if (!old_acting->size && !new_acting->size)
+               return false; /* both still empty */
+
+       if (!old_acting->size ^ !new_acting->size)
+               return true; /* was empty, now not, or vice versa */
+
+       if (old_acting->primary != new_acting->primary)
+               return true; /* primary changed */
+
+       if (calc_pg_rank(old_acting->primary, old_acting) !=
+           calc_pg_rank(new_acting->primary, new_acting))
+               return true; /* same primary osd, but at a different index in the set */
+
+       return false; /* same primary (tho replicas may have changed) */
+}
+
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+                      const struct ceph_osds *new_acting,
+                      bool any_change)
+{
+       if (primary_changed(old_acting, new_acting))
+               return true; /* reported regardless of @any_change */
+
+       if (any_change && !__osds_equal(old_acting, new_acting))
+               return true; /* osd list differs and the caller asked about any change */
+
+       return false;
+}
+
  /*
   * calculate file layout from given offset, length.
   * fill in correct oid, logical length, and object extent
author	Ilya Dryomov <idryomov@gmail.com>
	Thu, 28 Apr 2016 14:07:23 +0000 (16:07 +0200)
committer	Ilya Dryomov <idryomov@gmail.com>
	Wed, 25 May 2016 22:36:26 +0000 (00:36 +0200)
fs/ceph/addr.c		patch \| blob \| history
fs/ceph/file.c		patch \| blob \| history
include/linux/ceph/osd_client.h		patch \| blob \| history
include/linux/ceph/osdmap.h		patch \| blob \| history
include/linux/ceph/rados.h		patch \| blob \| history
net/ceph/osd_client.c		patch \| blob \| history
net/ceph/osdmap.c		patch \| blob \| history