static DEFINE_SPINLOCK(cache_lock);
static int num_spu_nodes;
int spu_prof_num_nodes;
-int last_guard_val[MAX_NUMNODES * 8];
+
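+/* One circular sample buffer per possible SPU. The delayed
+ * work item spu_work periodically drains the buffers into the
+ * oprofile event buffer; max_spu_buff is the size of each
+ * buffer in entries.
+ */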
+struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
+struct delayed_work spu_work;
+static unsigned max_spu_buff;
+
+static void spu_buff_add(unsigned long int value, int spu)
+{
+ /* spu_buff is a circular buffer. Entries are added at the
+ * head; head is the index where the next value is stored.
+ * One entry is always left unused, i.e. head is never
+ * allowed to catch up with tail, so that a full buffer
+ * (head one slot behind tail) can be distinguished from
+ * an empty one (head == tail).
+ *
+ * ASSUMPTION: the caller holds buffer_lock, which protects
+ * the buffer and the head and tail indices.
+ */
+ int full = 1;
+
+ if (spu_buff[spu].head >= spu_buff[spu].tail) {
+ if ((spu_buff[spu].head - spu_buff[spu].tail)
+ < (max_spu_buff - 1))
+ full = 0;
+
+ } else if (spu_buff[spu].tail > spu_buff[spu].head) {
+ if ((spu_buff[spu].tail - spu_buff[spu].head)
+ > 1)
+ full = 0;
+ }
+
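+ /* Worked example with illustrative numbers, max_spu_buff = 8:
+ * head = 7, tail = 0 gives head - tail = 7 = max_spu_buff - 1,
+ * so the buffer is full. After head wraps, head = 2, tail = 3
+ * gives tail - head = 1, again full; only tail - head > 1
+ * leaves room for another entry.
+ */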
+ if (!full) {
+ spu_buff[spu].buff[spu_buff[spu].head] = value;
+ spu_buff[spu].head++;
+
+ if (spu_buff[spu].head >= max_spu_buff)
+ spu_buff[spu].head = 0;
+ } else {
+ /* From the user's perspective, SPU buffer sizing and
+ * overflow behave like the per cpu buffers: the same
+ * per cpu parameter sets the SPU buffer size, and
+ * sample_lost_overflow is incremented here to tell the
+ * user the buffer size needs to be increased.
+ */
+ oprofile_cpu_buffer_inc_smpl_lost();
+ }
+}
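+
+/* Typical calling pattern (sketch, mirroring the context-switch
+ * record code below): hold buffer_lock across one logical record
+ * so its words land contiguously in the buffer:
+ *
+ *	spin_lock_irqsave(&buffer_lock, flags);
+ *	spu_buff_add(ESCAPE_CODE, spu);
+ *	spu_buff_add(SPU_CTX_SWITCH_CODE, spu);
+ *	...
+ *	spin_unlock_irqrestore(&buffer_lock, flags);
+ */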
+
+/* This function copies the per SPU buffers to the
+ * OProfile kernel buffer.
+ */
+void sync_spu_buff(void)
+{
+ int spu;
+ unsigned long flags;
+ int curr_head;
+
+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ /* Skip this SPU if its buffer failed to allocate. */
+ if (spu_buff[spu].buff == NULL)
+ continue;
+
+ /* Hold the lock to make sure the head/tail
+ * doesn't change while spu_buff_add() is
+ * deciding if the buffer is full or not.
+ * Being a little paranoid.
+ */
+ spin_lock_irqsave(&buffer_lock, flags);
+ curr_head = spu_buff[spu].head;
+ spin_unlock_irqrestore(&buffer_lock, flags);
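+ /* head is only advanced by spu_buff_add() with
+ * buffer_lock held, so curr_head is a stable limit for
+ * this pass; samples added after the snapshot are
+ * picked up on the next sync.
+ */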
+
+ /* Transfer the current contents to the kernel buffer.
+ * Data can still be added at the head of the buffer
+ * while the copy is in progress.
+ */
+ oprofile_put_buff(spu_buff[spu].buff,
+ spu_buff[spu].tail,
+ curr_head, max_spu_buff);
+
+ spin_lock_irqsave(&buffer_lock, flags);
+ spu_buff[spu].tail = curr_head;
+ spin_unlock_irqrestore(&buffer_lock, flags);
+ }
+}
+
+static void wq_sync_spu_buff(struct work_struct *work)
+{
+ /* move data from spu buffers to kernel buffer */
+ sync_spu_buff();
+
+ /* reschedule only while profiling is still running */
+ if (spu_prof_running)
+ schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+}
/* Container for caching information about an active SPU task. */
struct cached_info {
/* Record context info in event buffer */
spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_CTX_SWITCH_CODE);
- add_event_entry(spu->number);
- add_event_entry(spu->pid);
- add_event_entry(spu->tgid);
- add_event_entry(app_dcookie);
- add_event_entry(spu_cookie);
- add_event_entry(offset);
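+ /* The per SPU context-switch record keeps the layout of
+ * the old add_event_entry() sequence: escape code, switch
+ * code, spu number, pid, tgid, application dcookie, spu
+ * cookie, object offset.
+ */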
+ spu_buff_add(ESCAPE_CODE, spu->number);
+ spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
+ spu_buff_add(spu->number, spu->number);
+ spu_buff_add(spu->pid, spu->number);
+ spu_buff_add(spu->tgid, spu->number);
+ spu_buff_add(app_dcookie, spu->number);
+ spu_buff_add(spu_cookie, spu->number);
+ spu_buff_add(offset, spu->number);
+
+ /* Set flag to indicate SPU PC data can now be written out. If
+ * the SPU program counter data is seen before an SPU context
+ * record is seen, the postprocessing will fail.
+ */
+ spu_buff[spu->number].ctx_sw_seen = 1;
+
spin_unlock_irqrestore(&buffer_lock, flags);
smp_wmb(); /* ensure spu event buffer updates are written */
/* don't want entries intermingled... */
return nodes;
}
+static int oprofile_spu_buff_create(void)
+{
+ int spu;
+
+ max_spu_buff = oprofile_get_cpu_buffer_size();
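+ /* max_spu_buff is a count of entries, not bytes. Purely
+ * illustrative sizing: 8192 entries would cost
+ * 8192 * sizeof(unsigned long) = 64 KiB per SPU on a
+ * 64-bit kernel.
+ */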
+
+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ /* Create a circular buffer for each SPU to store its
+ * data in; buffer_lock serializes access to head and
+ * tail.
+ */
+ spu_buff[spu].head = 0;
+ spu_buff[spu].tail = 0;
+
+ /*
+ * Create a buffer for each SPU. A single buffer for all
+ * SPUs can't be allocated reliably because it would
+ * require too large a block of contiguous kernel memory.
+ */
+
+ spu_buff[spu].buff = kzalloc((max_spu_buff
+ * sizeof(unsigned long)),
+ GFP_KERNEL);
+
+ if (!spu_buff[spu].buff) {
+ printk(KERN_ERR "SPU_PROF: "
+ "%s, line %d: oprofile_spu_buff_create "
+ "failed to allocate spu buffer %d.\n",
+ __func__, __LINE__, spu);
+
+ /* release the spu buffers allocated so far;
+ * kfree(NULL) is a no-op for the entry that just
+ * failed
+ */
+ while (spu >= 0) {
+ kfree(spu_buff[spu].buff);
+ spu_buff[spu].buff = NULL;
+ spu--;
+ }
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
/* The main purpose of this function is to synchronize
* OProfile with SPUFS by registering to be notified of
* SPU task switches.
*/
int spu_sync_start(void)
{
- int k;
+ int spu;
int ret = SKIP_GENERIC_SYNC;
int register_ret;
unsigned long flags = 0;
spu_prof_num_nodes = number_of_online_nodes();
num_spu_nodes = spu_prof_num_nodes * 8;
+ INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
+
+ /* Create the per-SPU buffers used to stage data for
+ * the kernel buffer.
+ */
+ ret = oprofile_spu_buff_create();
+ if (ret)
+ goto out;
spin_lock_irqsave(&buffer_lock, flags);
- add_event_entry(ESCAPE_CODE);
- add_event_entry(SPU_PROFILING_CODE);
- add_event_entry(num_spu_nodes);
+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ spu_buff_add(ESCAPE_CODE, spu);
+ spu_buff_add(SPU_PROFILING_CODE, spu);
+ spu_buff_add(num_spu_nodes, spu);
+ }
spin_unlock_irqrestore(&buffer_lock, flags);
+ for (spu = 0; spu < num_spu_nodes; spu++) {
+ spu_buff[spu].ctx_sw_seen = 0;
+ spu_buff[spu].last_guard_val = 0;
+ }
+
/* Register for SPU events */
register_ret = spu_switch_event_register(&spu_active);
if (register_ret) {
ret = SYNC_START_ERROR;
goto out;
}
- for (k = 0; k < (MAX_NUMNODES * 8); k++)
- last_guard_val[k] = 0;
pr_debug("spu_sync_start -- running.\n");
out:
return ret;
* use. We need to discard samples taken during the time
* period in which an overlay occurs (i.e., guard value changes).
*/
- if (grd_val && grd_val != last_guard_val[spu_num]) {
- last_guard_val[spu_num] = grd_val;
+ if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
+ spu_buff[spu_num].last_guard_val = grd_val;
/* Drop the rest of the samples. */
break;
}
- add_event_entry(file_offset | spu_num_shifted);
+ /* We must ensure that the SPU context switch has been written
+ * out before samples for the SPU. Otherwise, the SPU context
+ * information is not available and the postprocessing of the
+ * SPU PC will fail with no available anonymous map information.
+ */
+ if (spu_buff[spu_num].ctx_sw_seen)
+ spu_buff_add((file_offset | spu_num_shifted),
+ spu_num);
}
spin_unlock(&buffer_lock);
out:
int spu_sync_stop(void)
{
unsigned long flags = 0;
- int ret = spu_switch_event_unregister(&spu_active);
- if (ret) {
+ int ret;
+ int k;
+
+ ret = spu_switch_event_unregister(&spu_active);
+
+ if (ret)
printk(KERN_ERR "SPU_PROF: "
- "%s, line %d: spu_switch_event_unregister returned %d\n",
- __func__, __LINE__, ret);
- goto out;
- }
+ "%s, line %d: spu_switch_event_unregister " \
+ "returned %d\n",
+ __func__, __LINE__, ret);
+
+ /* flush any remaining data in the per SPU buffers */
+ sync_spu_buff();
spin_lock_irqsave(&cache_lock, flags);
ret = release_cached_info(RELEASE_ALL);
spin_unlock_irqrestore(&cache_lock, flags);
-out:
+
+ /* Remove the scheduled work queue item rather than
+ * waiting for every queued entry to execute; any
+ * remaining per SPU data was already flushed to the
+ * event buffer by sync_spu_buff() above.
+ */
+ cancel_delayed_work(&spu_work);
+
+ for (k = 0; k < num_spu_nodes; k++) {
+ spu_buff[k].ctx_sw_seen = 0;
+
+ /*
+ * spu_buff[k].buff will be NULL if there was a problem
+ * allocating the buffer. kfree(NULL) is a no-op, so
+ * the unconditional free is safe.
+ */
+ kfree(spu_buff[k].buff);
+ spu_buff[k].buff = NULL;
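+ /* Resetting buff to NULL matches the check at the top
+ * of sync_spu_buff(), which skips SPUs whose buffer is
+ * not allocated.
+ */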
+ }
pr_debug("spu_sync_stop -- done.\n");
return ret;
}
-