x86/UV2: Work around BAU bug

author Cliff Wickman <cpw@sgi.com>

Mon, 16 Jan 2012 21:19:47 +0000 (15:19 -0600)

committer Ingo Molnar <mingo@elte.hu>

Tue, 17 Jan 2012 08:09:54 +0000 (09:09 +0100)
author Cliff Wickman <cpw@sgi.com>
Mon, 16 Jan 2012 21:19:47 +0000 (15:19 -0600)
committer Ingo Molnar <mingo@elte.hu>
Tue, 17 Jan 2012 08:09:54 +0000 (09:09 +0100)
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h

index 4a46b27..1b82f7e 100644 (file)
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -167,6 +167,7 @@
  #define FLUSH_RETRY_TIMEOUT            2
  #define FLUSH_GIVEUP                   3
  #define FLUSH_COMPLETE                 4
+#define FLUSH_RETRY_BUSYBUG            5
  
  /*
   * tuning the action when the numalink network is extremely delayed
@@ -463,7 +464,6 @@ struct bau_pq_entry {
  struct msg_desc {
         struct bau_pq_entry     *msg;
         int                     msg_slot;
-       int                     swack_slot;
         struct bau_pq_entry     *queue_first;
         struct bau_pq_entry     *queue_last;
  };
@@ -517,6 +517,9 @@ struct ptc_stats {
         unsigned long   s_retry_messages;       /* retry broadcasts */
         unsigned long   s_bau_reenabled;        /* for bau enable/disable */
         unsigned long   s_bau_disabled;         /* for bau enable/disable */
+       unsigned long   s_uv2_wars;             /* uv2 workaround, perm. busy */
+       unsigned long   s_uv2_wars_hw;          /* uv2 workaround, hiwater */
+       unsigned long   s_uv2_war_waits;        /* uv2 workaround, long waits */
         /* destination statistics */
         unsigned long   d_alltlb;               /* times all tlb's on this
                                                    cpu were flushed */
@@ -593,6 +596,8 @@ struct bau_control {
         short                   cpus_in_socket;
         short                   cpus_in_uvhub;
         short                   partition_base_pnode;
+       short                   using_desc; /* an index, like uvhub_cpu */
+       unsigned int            inuse_map;
         unsigned short          message_number;
         unsigned short          uvhub_quiesce;
         short                   socket_acknowledge_count[DEST_Q_SIZE];
@@ -610,6 +615,7 @@ struct bau_control {
         int                     cong_response_us;
         int                     cong_reps;
         int                     cong_period;
+       unsigned long           clocks_per_100_usec;
         cycles_t                period_time;
         long                    period_requests;
         struct hub_and_pnode    *thp;
@@ -670,6 +676,11 @@ static inline void write_mmr_sw_ack(unsigned long mr)
         uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
  }
  
+/* Write mr to UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS via write_gmmr() for the given pnode. */
+static inline void write_gmmr_sw_ack(int pnode, unsigned long mr)
+{
+       write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
  static inline unsigned long read_mmr_sw_ack(void)
  {
         return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c

index c425ff1..9010ca7 100644 (file)
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub)
   * clear of the Timeout bit (as well) will free the resource. No reply will
   * be sent (the hardware will only do one reply per message).
   */
-static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
+                                               int do_acknowledge)
  {
         unsigned long dw;
         struct bau_pq_entry *msg;
  
         msg = mdp->msg;
-       if (!msg->canceled) {
+       if (!msg->canceled && do_acknowledge) {
                 dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
                 write_mmr_sw_ack(dw);
         }
@@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
                         if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
                                 unsigned long mr;
                                 /*
-                                * is the resource timed out?
-                                * make everyone ignore the cancelled message.
+                                * Is the resource timed out?
+                                * Make everyone ignore the cancelled message.
                                  */
                                 msg2->canceled = 1;
                                 stat->d_canceled++;
@@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
   * Do all the things a cpu should do for a TLB shootdown message.
   * Other cpu's may come here at the same time for this message.
   */
-static void bau_process_message(struct msg_desc *mdp,
-                                       struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
+                                               int do_acknowledge)
  {
         short socket_ack_count = 0;
         short *sp;
@@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp,
                 if (msg_ack_count == bcp->cpus_in_uvhub) {
                         /*
                          * All cpus in uvhub saw it; reply
+                        * (unless we are in the UV2 workaround)
                          */
-                       reply_to_message(mdp, bcp);
+                       reply_to_message(mdp, bcp, do_acknowledge);
                 }
         }
  
@@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
  /*
   * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
   */
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
  {
         unsigned long descriptor_status;
         unsigned long descriptor_status2;
  
         descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
-       descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+       descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
         descriptor_status = (descriptor_status << 1) | descriptor_status2;
         return descriptor_status;
  }
  
+/*
+ * Report whether the descriptor normally used by this cpu (the one indexed
+ * by bcp->uvhub_cpu, its hub-relative cpu number) is busy.
+ * The status of the original 32 descriptors is always reflected in the 64
+ * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0, so only that MMR is read.
+ * The bit provided by the activation_status_2 register is irrelevant to
+ * a busy/not-busy test, so it is left as 0 in the value that is compared
+ * with UV2H_DESC_BUSY.
+ * Returns 1 if busy, 0 if not.
+ */
+int normal_busy(struct bau_control *bcp)
+{
+       int shift = bcp->uvhub_cpu * UV_ACT_STATUS_SIZE;
+       unsigned long status;
+
+       status = read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0) >> shift;
+       status = (status & UV_ACT_STATUS_MASK) << 1;
+       return status == UV2H_DESC_BUSY;
+}
+
+/*
+ * Entered when a bau descriptor has gone into a permanent busy wait because
+ * of a hardware bug.
+ * Work around the bug: move this cpu to a descriptor that is not busy (its
+ * normal one if free, else an unused alternate, else wait for the normal
+ * one), copy the busy descriptor's contents there and set bcp->using_desc.
+ * Always returns FLUSH_RETRY_BUSYBUG so that the caller resends.
+ */
+int handle_uv2_busy(struct bau_control *bcp)
+{
+       int busy_one = bcp->using_desc;
+       int normal = bcp->uvhub_cpu;
+       int selected = -1;
+       int i;
+       unsigned long descriptor_status;
+       unsigned long status;
+       int mmr_offset;
+       struct bau_desc *bau_desc_old;
+       struct bau_desc *bau_desc_new;
+       struct bau_control *hmaster = bcp->uvhub_master;
+       struct ptc_stats *stat = bcp->statp;
+
+       stat->s_uv2_wars++;
+       spin_lock(&hmaster->uvhub_lock);
+       /* try for the original first */
+       if (busy_one != normal) {
+               if (!normal_busy(bcp))
+                       selected = normal;
+       }
+       if (selected < 0) {
+               /* can't use the normal, select an alternate */
+               mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+               descriptor_status = read_lmmr(mmr_offset);
+
+               /* scan available descriptors 32-63 */
+               for (i = 0; i < UV_CPUS_PER_AS; i++) {
+                       if ((hmaster->inuse_map & (1U << i)) == 0) {
+                               status = ((descriptor_status >>
+                                               (i * UV_ACT_STATUS_SIZE)) &
+                                               UV_ACT_STATUS_MASK) << 1;
+                               if (status != UV2H_DESC_BUSY) {
+                                       selected = i + UV_CPUS_PER_AS;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       if (busy_one != normal)
+               /* mark the busy alternate as not in-use */
+               hmaster->inuse_map &= ~(1U << (busy_one - UV_CPUS_PER_AS));
+
+       if (selected >= 0) {
+               /* switch to the selected descriptor */
+               if (selected != normal) {
+                       /* set the selected alternate as in-use */
+                       hmaster->inuse_map |=
+                                       (1U << (selected - UV_CPUS_PER_AS));
+                       if (selected > stat->s_uv2_wars_hw)
+                               stat->s_uv2_wars_hw = selected;
+               }
+               bau_desc_old = bcp->descriptor_base;
+               bau_desc_old += (ITEMS_PER_DESC * busy_one);
+               bcp->using_desc = selected;
+               bau_desc_new = bcp->descriptor_base;
+               bau_desc_new += (ITEMS_PER_DESC * selected);
+               *bau_desc_new = *bau_desc_old;
+       } else {
+               /*
+                * All are busy. Wait for the normal one for this cpu to
+                * free up.
+                */
+               stat->s_uv2_war_waits++;
+               spin_unlock(&hmaster->uvhub_lock);
+               do {
+                       cpu_relax();
+               } while (normal_busy(bcp));
+               spin_lock(&hmaster->uvhub_lock);
+               /* switch from the descriptor that hung to the original one */
+               bau_desc_old = bcp->descriptor_base;
+               bau_desc_old += (ITEMS_PER_DESC * busy_one);
+               bcp->using_desc = normal;
+               bau_desc_new = bcp->descriptor_base;
+               bau_desc_new += (ITEMS_PER_DESC * normal);
+               *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
+       }
+       spin_unlock(&hmaster->uvhub_lock);
+       return FLUSH_RETRY_BUSYBUG;
+}
+
  static int uv2_wait_completion(struct bau_desc *bau_desc,
                                 unsigned long mmr_offset, int right_shift,
                                 struct bau_control *bcp, long try)
  {
         unsigned long descriptor_stat;
         cycles_t ttm;
-       int cpu = bcp->uvhub_cpu;
+       int desc = bcp->using_desc;
+       long busy_reps = 0;
         struct ptc_stats *stat = bcp->statp;
  
-       descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+       descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
  
         /* spin on the status MMR, waiting for it to go idle */
         while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -542,12 +655,23 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
                         bcp->conseccompletes = 0;
                         return FLUSH_RETRY_TIMEOUT;
                 } else {
+                       busy_reps++;
+                       if (busy_reps > 1000000) {
+                               /* not to hammer on the clock */
+                               busy_reps = 0;
+                               ttm = get_cycles();
+                               if ((ttm - bcp->send_message) >
+                                       (bcp->clocks_per_100_usec)) {
+                                       return handle_uv2_busy(bcp);
+                               }
+                       }
                         /*
                          * descriptor_stat is still BUSY
                          */
                         cpu_relax();
                 }
-               descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+               descriptor_stat = uv2_read_status(mmr_offset, right_shift,
+                                                                       desc);
         }
         bcp->conseccompletes++;
         return FLUSH_COMPLETE;
@@ -563,14 +687,14 @@ static int wait_completion(struct bau_desc *bau_desc,
  {
         int right_shift;
         unsigned long mmr_offset;
-       int cpu = bcp->uvhub_cpu;
+       int desc = bcp->using_desc;
  
-       if (cpu < UV_CPUS_PER_AS) {
+       if (desc < UV_CPUS_PER_AS) {
                 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
-               right_shift = cpu * UV_ACT_STATUS_SIZE;
+               right_shift = desc * UV_ACT_STATUS_SIZE;
         } else {
                 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
-               right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
+               right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
         }
  
         if (bcp->uvhub_version == 1)
@@ -752,8 +876,7 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
   * Returns 1 if it gives up entirely and the original cpu mask is to be
   * returned to the kernel.
   */
-int uv_flush_send_and_wait(struct bau_desc *bau_desc,
-                       struct cpumask *flush_mask, struct bau_control *bcp)
+int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
  {
         int seq_number = 0;
         int completion_stat = 0;
@@ -766,20 +889,24 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
         struct bau_control *hmaster = bcp->uvhub_master;
         struct uv1_bau_msg_header *uv1_hdr = NULL;
         struct uv2_bau_msg_header *uv2_hdr = NULL;
+       struct bau_desc *bau_desc;
  
-       if (bcp->uvhub_version == 1) {
-               uv1 = 1;
+       if (bcp->uvhub_version == 1)
                 uv1_throttle(hmaster, stat);
-               uv1_hdr = &bau_desc->header.uv1_hdr;
-       } else
-               uv2_hdr = &bau_desc->header.uv2_hdr;
  
         while (hmaster->uvhub_quiesce)
                 cpu_relax();
  
         time1 = get_cycles();
         do {
-               if (try == 0) {
+               bau_desc = bcp->descriptor_base;
+               bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
+               if (bcp->uvhub_version == 1) {
+                       uv1 = 1;
+                       uv1_hdr = &bau_desc->header.uv1_hdr;
+               } else
+                       uv2_hdr = &bau_desc->header.uv2_hdr;
+               if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
                         if (uv1)
                                 uv1_hdr->msg_type = MSG_REGULAR;
                         else
@@ -797,13 +924,14 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
                         uv1_hdr->sequence = seq_number;
                 else
                         uv2_hdr->sequence = seq_number;
-               index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
+               index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
                 bcp->send_message = get_cycles();
  
                 write_mmr_activation(index);
  
                 try++;
                 completion_stat = wait_completion(bau_desc, bcp, try);
+               /* UV2: wait_completion() may change the bcp->using_desc */
  
                 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
  
@@ -814,6 +942,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
                 }
                 cpu_relax();
         } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+                (completion_stat == FLUSH_RETRY_BUSYBUG) ||
                  (completion_stat == FLUSH_RETRY_TIMEOUT));
  
         time2 = get_cycles();
@@ -828,6 +957,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
         record_send_stats(time1, time2, bcp, stat, completion_stat, try);
  
         if (completion_stat == FLUSH_GIVEUP)
+               /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
                 return 1;
         return 0;
  }
@@ -983,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
                 stat->s_ntargself++;
  
         bau_desc = bcp->descriptor_base;
-       bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
+       bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
         bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
         if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
                 return NULL;
@@ -996,13 +1126,86 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
          * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
          * or 1 if it gave up and the original cpumask should be returned.
          */
-       if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+       if (!uv_flush_send_and_wait(flush_mask, bcp))
                 return NULL;
         else
                 return cpumask;
  }
  
  /*
+ * Search the message queue for any 'other' message with the same software
+ * acknowledge resource bit vector.
+ */
+struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
+                       struct bau_control *bcp, unsigned char swack_vec)
+{
+       struct bau_pq_entry *msg_next = msg + 1;
+
+       if (msg_next > bcp->queue_last)
+               msg_next = bcp->queue_first;
+       while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
+               if (msg_next->swack_vec == swack_vec)
+                       return msg_next;
+               msg_next++;
+               if (msg_next > bcp->queue_last)
+                       msg_next = bcp->queue_first;
+       }
+       return NULL;
+}
+
+/*
+ * UV2 needs to work around a bug in which an arriving message has not
+ * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
+ * The message described by mdp is always handed to bau_process_message(),
+ * so the requested shootdown is done in every case; what is decided here
+ * is only whether that call may acknowledge (reply to) the message.
+ */
+void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+{
+       struct bau_pq_entry *cur = mdp->msg;
+       unsigned long pending = read_mmr_sw_ack();
+       int do_acknowledge;
+
+       if ((cur->swack_vec & pending) == 0) {
+               /*
+                * This message was assigned a swack resource, but no
+                * reserved acknowledgment is pending.
+                * The bug has prevented this message from setting the MMR.
+                * And no other message has used the same sw_ack resource.
+                * Do the requested shootdown but do not reply to the msg.
+                */
+               do_acknowledge = 0;
+       } else if (find_another_by_swack(cur, bcp, cur->swack_vec)) {
+               /*
+                * Some message has set the MMR 'pending' bit, and another
+                * message in the queue uses the same sw_ack.
+                * Do not ack the current one; let the natural processing
+                * of that other message acknowledge it, so the processing
+                * of sw_ack's does not get out of order.
+                */
+               do_acknowledge = 0;
+       } else {
+               /*
+                * There is no other message using this sw_ack, so it is
+                * safe to acknowledge it.
+                */
+               do_acknowledge = 1;
+       }
+
+       bau_process_message(mdp, bcp, do_acknowledge);
+}
+
+/*
   * The BAU message interrupt comes here. (registered by set_intr_gate)
   * See entry_64.S
   *
@@ -1038,9 +1241,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
                 count++;
  
                 msgdesc.msg_slot = msg - msgdesc.queue_first;
-               msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
                 msgdesc.msg = msg;
-               bau_process_message(&msgdesc, bcp);
+               if (bcp->uvhub_version == 2)
+                       process_uv2_message(&msgdesc, bcp);
+               else
+                       bau_process_message(&msgdesc, bcp, 1);
  
                 msg++;
                 if (msg > msgdesc.queue_last)
@@ -1158,7 +1363,7 @@ static int ptc_seq_show(struct seq_file *file, void *data)
                 seq_printf(file,
                         "all one mult none retry canc nocan reset rcan ");
                 seq_printf(file,
-                       "disable enable\n");
+                       "disable enable wars warshw warwaits\n");
         }
         if (cpu < num_possible_cpus() && cpu_online(cpu)) {
                 stat = &per_cpu(ptcstats, cpu);
@@ -1189,8 +1394,10 @@ static int ptc_seq_show(struct seq_file *file, void *data)
                            stat->d_nomsg, stat->d_retries, stat->d_canceled,
                            stat->d_nocanceled, stat->d_resets,
                            stat->d_rcanceled);
-               seq_printf(file, "%ld %ld\n",
-                       stat->s_bau_disabled, stat->s_bau_reenabled);
+               seq_printf(file, "%ld %ld %ld %ld %ld\n",
+                       stat->s_bau_disabled, stat->s_bau_reenabled,
+                       stat->s_uv2_wars, stat->s_uv2_wars_hw,
+                       stat->s_uv2_war_waits);
         }
         return 0;
  }
@@ -1564,6 +1771,7 @@ static void pq_init(int node, int pnode)
         write_mmr_payload_first(pnode, pn_first);
         write_mmr_payload_tail(pnode, first);
         write_mmr_payload_last(pnode, last);
+       write_gmmr_sw_ack(pnode, 0xffffUL);
  
         /* in effect, all msg_type's are set to MSG_NOOP */
         memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
@@ -1651,6 +1859,7 @@ static void __init init_per_cpu_tunables(void)
                 bcp->cong_response_us           = congested_respns_us;
                 bcp->cong_reps                  = congested_reps;
                 bcp->cong_period                = congested_period;
+               bcp->clocks_per_100_usec =      usec_2_cycles(100);
         }
  }
  
@@ -1771,6 +1980,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
                 }
                 bcp->uvhub_master = *hmasterp;
                 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+               bcp->using_desc = bcp->uvhub_cpu;
                 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
                         printk(KERN_EMERG "%d cpus per uvhub invalid\n",
                                 bcp->uvhub_cpu);
author	Cliff Wickman <cpw@sgi.com>
	Mon, 16 Jan 2012 21:19:47 +0000 (15:19 -0600)
committer	Ingo Molnar <mingo@elte.hu>
	Tue, 17 Jan 2012 08:09:54 +0000 (09:09 +0100)
arch/x86/include/asm/uv/uv_bau.h		patch \| blob \| history
arch/x86/platform/uv/tlb_uv.c		patch \| blob \| history