From f120f13ea0dbb0b0d6675683d5f6faea71277e65 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@xensource.com>
Date: Tue, 17 Jul 2007 18:37:06 -0700
Subject: [PATCH] xen: Add support for preemption

Add Xen support for preemption.  This is mostly a cleanup of existing
preempt_enable/disable calls, or just comments to explain the current
usage.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
---
 arch/i386/xen/Kconfig      |  2 +-
 arch/i386/xen/enlighten.c  | 80 ++++++++++++++++++++++++++++------------------
 arch/i386/xen/mmu.c        |  3 ++
 arch/i386/xen/multicalls.c | 11 ++++---
 arch/i386/xen/time.c       | 22 ++++++++++---
 5 files changed, 76 insertions(+), 42 deletions(-)

diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig
index b7697ff..9df99e1 100644
--- a/arch/i386/xen/Kconfig
+++ b/arch/i386/xen/Kconfig
@@ -4,7 +4,7 @@
 
 config XEN
 	bool "Enable support for Xen hypervisor"
-	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !(PREEMPT || NEED_MULTIPLE_NODES)
+	depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c
index de62d66..a1124b7 100644
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/preempt.h>
+#include <linux/hardirq.h>
 #include <linux/percpu.h>
 #include <linux/delay.h>
 #include <linux/start_kernel.h>
@@ -108,11 +109,10 @@ static unsigned long xen_save_fl(void)
 	struct vcpu_info *vcpu;
 	unsigned long flags;
 
-	preempt_disable();
 	vcpu = x86_read_percpu(xen_vcpu);
+
 	/* flag has opposite sense of mask */
 	flags = !vcpu->evtchn_upcall_mask;
-	preempt_enable();
 
 	/* convert to IF type flag
 	   -0 -> 0x00000000
@@ -125,32 +125,35 @@ static void xen_restore_fl(unsigned long flags)
 {
 	struct vcpu_info *vcpu;
 
-	preempt_disable();
-
 	/* convert from IF type flag */
 	flags = !(flags & X86_EFLAGS_IF);
+
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
 	vcpu = x86_read_percpu(xen_vcpu);
 	vcpu->evtchn_upcall_mask = flags;
+	preempt_enable_no_resched();
 
-	if (flags == 0) {
-		/* Unmask then check (avoid races).  We're only protecting
-		   against updates by this CPU, so there's no need for
-		   anything stronger. */
-		barrier();
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
 
+	if (flags == 0) {
+		preempt_check_resched();
+		barrier(); /* unmask then check (avoid races) */
 		if (unlikely(vcpu->evtchn_upcall_pending))
 			force_evtchn_callback();
-		preempt_enable();
-	} else
-		preempt_enable_no_resched();
+	}
 }
 
 static void xen_irq_disable(void)
 {
-	struct vcpu_info *vcpu;
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
 	preempt_disable();
-	vcpu = x86_read_percpu(xen_vcpu);
-	vcpu->evtchn_upcall_mask = 1;
+	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
 
@@ -158,18 +161,20 @@ static void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
 
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
 	preempt_disable();
 	vcpu = x86_read_percpu(xen_vcpu);
 	vcpu->evtchn_upcall_mask = 0;
+	preempt_enable_no_resched();
 
-	/* Unmask then check (avoid races).  We're only protecting
-	   against updates by this CPU, so there's no need for
-	   anything stronger. */
-	barrier();
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
 
+	barrier(); /* unmask then check (avoid races) */
 	if (unlikely(vcpu->evtchn_upcall_pending))
 		force_evtchn_callback();
-	preempt_enable();
 }
 
 static void xen_safe_halt(void)
@@ -189,6 +194,8 @@ static void xen_halt(void)
 
 static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
 {
+	BUG_ON(preemptible());
+
 	switch (mode) {
 	case PARAVIRT_LAZY_NONE:
 		BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
@@ -293,9 +300,13 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 	xmaddr_t mach_lp = virt_to_machine(lp);
 	u64 entry = (u64)high << 32 | low;
 
+	preempt_disable();
+
 	xen_mc_flush();
 	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
 		BUG();
+
+	preempt_enable();
 }
 
 static int cvt_gate_to_trap(int vector, u32 low, u32 high,
@@ -328,11 +339,13 @@ static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
 static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
 				u32 low, u32 high)
 {
-
-	int cpu = smp_processor_id();
 	unsigned long p = (unsigned long)&dt[entrynum];
-	unsigned long start = per_cpu(idt_desc, cpu).address;
-	unsigned long end = start + per_cpu(idt_desc, cpu).size + 1;
+	unsigned long start, end;
+
+	preempt_disable();
+
+	start = __get_cpu_var(idt_desc).address;
+	end = start + __get_cpu_var(idt_desc).size + 1;
 
 	xen_mc_flush();
 
@@ -347,6 +360,8 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
 			if (HYPERVISOR_set_trap_table(info))
 				BUG();
 	}
+
+	preempt_enable();
 }
 
 static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
@@ -368,11 +383,9 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
 
 void xen_copy_trap_info(struct trap_info *traps)
 {
-	const struct Xgt_desc_struct *desc = &get_cpu_var(idt_desc);
+	const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
 
 	xen_convert_trap_info(desc, traps);
-
-	put_cpu_var(idt_desc);
 }
 
 /* Load a new IDT into Xen.  In principle this can be per-CPU, so we
@@ -382,12 +395,11 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
 {
 	static DEFINE_SPINLOCK(lock);
 	static struct trap_info traps[257];
-	int cpu = smp_processor_id();
-
-	per_cpu(idt_desc, cpu) = *desc;
 
 	spin_lock(&lock);
 
+	__get_cpu_var(idt_desc) = *desc;
+
 	xen_convert_trap_info(desc, traps);
 
 	xen_mc_flush();
@@ -402,6 +414,8 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 				u32 low, u32 high)
 {
+	preempt_disable();
+
 	switch ((high >> 8) & 0xff) {
 	case DESCTYPE_LDT:
 	case DESCTYPE_TSS:
@@ -418,10 +432,12 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 	}
 
 	}
+
+	preempt_enable();
 }
 
 static void xen_load_esp0(struct tss_struct *tss,
-				   struct thread_struct *thread)
+			  struct thread_struct *thread)
 {
 	struct multicall_space mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
@@ -525,6 +541,8 @@ static unsigned long xen_read_cr3(void)
 
 static void xen_write_cr3(unsigned long cr3)
 {
+	BUG_ON(preemptible());
+
 	if (cr3 == x86_read_percpu(xen_cr3)) {
 		/* just a simple tlb flush */
 		xen_flush_tlb();
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
index bc49ef8..f431cf1 100644
--- a/arch/i386/xen/mmu.c
+++ b/arch/i386/xen/mmu.c
@@ -38,6 +38,7 @@
  *
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
+#include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/bug.h>
 #include <linux/sched.h>
@@ -531,5 +532,7 @@ void xen_exit_mmap(struct mm_struct *mm)
 	drop_mm_ref(mm);
 	put_cpu();
 
+	spin_lock(&mm->page_table_lock);
 	xen_pgd_unpin(mm->pgd);
+	spin_unlock(&mm->page_table_lock);
 }
diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c
index 869f983..d4015a9 100644
--- a/arch/i386/xen/multicalls.c
+++ b/arch/i386/xen/multicalls.c
@@ -20,6 +20,7 @@
  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  */
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 #include <asm/xen/hypercall.h>
 
@@ -39,10 +40,12 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
 
 void xen_mc_flush(void)
 {
-	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	int ret = 0;
 	unsigned long flags;
 
+	BUG_ON(preemptible());
+
 	/* Disable interrupts in case someone comes in and queues
 	   something in the middle */
 	local_irq_save(flags);
@@ -60,7 +63,6 @@ void xen_mc_flush(void)
 	} else
 		BUG_ON(b->argidx != 0);
 
-	put_cpu_var(mc_buffer);
 	local_irq_restore(flags);
 
 	BUG_ON(ret);
@@ -68,10 +70,11 @@ void xen_mc_flush(void)
 
 struct multicall_space __xen_mc_entry(size_t args)
 {
-	struct mc_buffer *b = &get_cpu_var(mc_buffer);
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	struct multicall_space ret;
 	unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
 
+	BUG_ON(preemptible());
 	BUG_ON(argspace > MC_ARGS);
 
 	if (b->mcidx == MC_BATCH ||
@@ -83,7 +86,5 @@ struct multicall_space __xen_mc_entry(size_t args)
 	ret.args = &b->args[b->argidx];
 	b->argidx += argspace;
 
-	put_cpu_var(mc_buffer);
-
 	return ret;
 }
diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c
index aeb04cf..51fdabf 100644
--- a/arch/i386/xen/time.c
+++ b/arch/i386/xen/time.c
@@ -88,7 +88,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
 	u64 state_time;
 	struct vcpu_runstate_info *state;
 
-	preempt_disable();
+	BUG_ON(preemptible());
 
 	state = &__get_cpu_var(runstate);
 
@@ -103,8 +103,6 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
 		*res = *state;
 		barrier();
 	} while (get64(&state->state_entry_time) != state_time);
-
-	preempt_enable();
 }
 
 static void setup_runstate_info(int cpu)
@@ -179,9 +177,19 @@ static void do_stolen_accounting(void)
 unsigned long long xen_sched_clock(void)
 {
 	struct vcpu_runstate_info state;
-	cycle_t now = xen_clocksource_read();
+	cycle_t now;
+	u64 ret;
 	s64 offset;
 
+	/*
+	 * Ideally sched_clock should be called on a per-cpu basis
+	 * anyway, so preempt should already be disabled, but that's
+	 * not current practice at the moment.
+	 */
+	preempt_disable();
+
+	now = xen_clocksource_read();
+
 	get_runstate_snapshot(&state);
 
 	WARN_ON(state.state != RUNSTATE_running);
@@ -190,9 +198,13 @@ unsigned long long xen_sched_clock(void)
 	if (offset < 0)
 		offset = 0;
 
-	return state.time[RUNSTATE_blocked] +
+	ret = state.time[RUNSTATE_blocked] +
 		state.time[RUNSTATE_running] +
 		offset;
+
+	preempt_enable();
+
+	return ret;
 }
 
 
-- 
2.7.4